From f6882f84e53e18efbdb461cf56729ff6362e70c5 Mon Sep 17 00:00:00 2001 From: achingbrain Date: Thu, 16 Jan 2020 09:36:22 +0000 Subject: [PATCH 1/2] fix: only output unixfs things This is a `unixfs-importer` but it's possible to output CIDs that resolve to `dag-raw` nodes if your data is small and you set `rawLeaves` and `reduceSingleLeafToSelf` to true. In that case you'll get a `dag-raw` node as the output. This makes it impossible to set metadata on small files with `rawLeaves` set to true. BREAKING CHANGE: If your data is below the chunk size, and you have `rawLeaves` and `reduceSingleLeafToSelf` set to true, you'll get a CID that resolves to a bona fide UnixFS file back with metadata and all that good stuff instead of a `dag-raw` node. --- package.json | 1 - src/dag-builder/file/index.js | 18 ++++++++++++++++++ src/utils/persist.js | 2 +- test/builder.spec.js | 2 +- test/chunker-custom.spec.js | 2 +- test/importer.spec.js | 2 +- 6 files changed, 22 insertions(+), 5 deletions(-) diff --git a/package.json b/package.json index 41b6cab..62200b9 100644 --- a/package.json +++ b/package.json @@ -48,7 +48,6 @@ "ipld-in-memory": "^3.0.0", "it-buffer-stream": "^1.0.0", "it-last": "^1.0.0", - "multihashes": "^0.4.14", "nyc": "^15.0.0", "sinon": "^8.0.4" }, diff --git a/src/dag-builder/file/index.js b/src/dag-builder/file/index.js index 0f32c33..f44c5e5 100644 --- a/src/dag-builder/file/index.js +++ b/src/dag-builder/file/index.js @@ -9,6 +9,7 @@ const { } = require('ipld-dag-pb') const all = require('it-all') const parallelBatch = require('it-parallel-batch') +const mc = require('multicodec') const dagBuilders = { flat: require('./flat'), @@ -52,6 +53,23 @@ const reduce = (file, ipld, options) => { if (leaves.length === 1 && leaves[0].single && options.reduceSingleLeafToSelf) { const leaf = leaves[0] + if (leaf.cid.codec === 'raw') { + // only one leaf node which is a buffer + const buffer = await ipld.get(leaf.cid) + + leaf.unixfs = new UnixFS({ + type: 'file', + mtime: file.mtime, + mode: file.mode, + data: buffer + }) + + const node = new DAGNode(leaf.unixfs.marshal()) + + leaf.cid = await ipld.put(node, mc.DAG_PB, options) + leaf.size = node.size + } + return { cid: leaf.cid, path: file.path, diff --git a/src/utils/persist.js b/src/utils/persist.js index 69e5af7..e6970b6 100644 --- a/src/utils/persist.js +++ b/src/utils/persist.js @@ -1,6 +1,6 @@ 'use strict' -const mh = require('multihashes') +const mh = require('multihashing-async').multihash const mc = require('multicodec') const persist = (node, ipld, options) => { diff --git a/test/builder.spec.js b/test/builder.spec.js index bbeb9d0..c9d5522 100644 --- a/test/builder.spec.js +++ b/test/builder.spec.js @@ -4,7 +4,7 @@ const chai = require('chai') chai.use(require('dirty-chai')) const expect = chai.expect -const mh = require('multihashes') +const mh = require('multihashing-async').multihash const IPLD = require('ipld') const inMemory = require('ipld-in-memory') const UnixFS = require('ipfs-unixfs') diff --git a/test/chunker-custom.spec.js b/test/chunker-custom.spec.js index 2dd00f0..8bc80dd 100644 --- a/test/chunker-custom.spec.js +++ b/test/chunker-custom.spec.js @@ -69,5 +69,5 @@ describe('custom chunker', function () { cid: await inmem.put(Buffer.from('hello world'), mc.RAW) } } - it('works with single part', fromPartsTest(single, 11)) + it('works with single part', fromPartsTest(single, 19)) }) diff --git a/test/importer.spec.js b/test/importer.spec.js index cf6784e..0f5b658 100644 --- a/test/importer.spec.js +++ b/test/importer.spec.js @@ -260,7 +260,7 @@ strategies.forEach((strategy) => { type: 'directory' }, '200Bytes.txt with raw leaves': extend({}, baseFiles['200Bytes.txt'], { - cid: 'zb2rhXrz1gkCv8p4nUDZRohY6MzBE9C3HVTVDP72g6Du3SD9Q', + cid: 'QmQmZQxSKQppbsWfVzBvg59Cn3DKtsNVQ94bjAxg2h3Lb8', size: 200 }) }, strategyOverrides[strategy]) From 08736916eccbca56646110a12679c81234eedc5a Mon Sep 17 00:00:00 2001 From: achingbrain Date: Thu, 16 Jan 2020 10:44:11 +0000 Subject: [PATCH 2/2] feat: default to using raw leaves Builds on #49. A separate PR as it's slightly contentious. Sets default DAG construction to be a `dag-pb` root node, `dag-pb` intermediate nodes and `ipld-raw` nodes for leaves. This will make parsing ever so slightly faster and DAG sizes ever so slightly smaller as there is no protobuf wrapper for the actual file data. Currently you may end up with `ipld-raw` leaves or `dag-pb` leaves that contain UnixFS entries with type 'file' or 'raw' depending on where the importer is invoked from. E.g. to generate the same CIDs as go-IPFS, `ipfs.add` will result in a balanced DAG with UnixFS leaf nodes with a type 'file', `ipfs.files.write` will result in a trickle DAG with UnixFS leaf nodes of a type `raw`, and specifying CID version 1 will get you `ipld-raw` leaf nodes and whatever tree strategy you specifed, default balanced. I think this is chaos, we should use `ipld-raw` leaf types everywhere and only offer options to change the DAG structure, not leaf types. --- README.md | 4 +-- src/dag-builder/file/index.js | 7 ++--- src/index.js | 10 +++++-- test/chunker-custom.spec.js | 2 +- test/hash-parity-with-go-ipfs.spec.js | 3 ++- test/importer.spec.js | 38 +++++++++++++-------------- 6 files changed, 36 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index da32517..ade47c6 100644 --- a/README.md +++ b/README.md @@ -141,8 +141,8 @@ The input's file paths and directory structure will be preserved in the [`dag-pb - `onlyHash` (boolean, defaults to false): Only chunk and hash - do not write to disk - `hashAlg` (string): multihash hashing algorithm to use - `cidVersion` (integer, default 0): the CID version to use when storing the data (storage keys are based on the CID, _including_ it's version) -- `rawLeaves` (boolean, defaults to false): When a file would span multiple DAGNodes, if this is true the leaf nodes will not be wrapped in `UnixFS` protobufs and will instead contain the raw file bytes -- `leafType` (string, defaults to `'file'`) what type of UnixFS node leaves should be - can be `'file'` or `'raw'` (ignored when `rawLeaves` is `true`) +- `rawLeaves` (boolean, defaults to true): When a file would span multiple DAGNodes, if this is true the leaf nodes will not be wrapped in `UnixFS` protobufs and will instead contain the raw file bytes +- `leafType` (string, defaults to `'file'`) what type of UnixFS node leaves should be - can be `'file'` or `'raw'` (ignored when `rawLeaves` is explicitly set to `true`) - `blockWriteConcurrency` (positive integer, defaults to 10) How many blocks to hash and write to the block store concurrently. For small numbers of large files this should be high (e.g. 50). - `fileImportConcurrency` (number, defaults to 50) How many files to import concurrently. For large numbers of small files this should be high (e.g. 50). diff --git a/src/dag-builder/file/index.js b/src/dag-builder/file/index.js index f44c5e5..bf946e0 100644 --- a/src/dag-builder/file/index.js +++ b/src/dag-builder/file/index.js @@ -53,7 +53,8 @@ const reduce = (file, ipld, options) => { if (leaves.length === 1 && leaves[0].single && options.reduceSingleLeafToSelf) { const leaf = leaves[0] - if (leaf.cid.codec === 'raw') { + // TODO: fix this for when onlyHash is passed + if (leaf.cid.codec === 'raw' && !options.onlyHash) { // only one leaf node which is a buffer const buffer = await ipld.get(leaf.cid) @@ -87,8 +88,8 @@ const reduce = (file, ipld, options) => { const links = leaves .filter(leaf => { - if (leaf.cid.codec === 'raw' && leaf.size) { - return true + if (leaf.cid.codec === 'raw') { + return Boolean(leaf.size) } if (!leaf.unixfs.data && leaf.unixfs.fileSize()) { diff --git a/src/index.js b/src/index.js index 052acff..adba151 100644 --- a/src/index.js +++ b/src/index.js @@ -6,7 +6,7 @@ const mergeOptions = require('merge-options').bind({ ignoreUndefined: true }) const defaultOptions = { chunker: 'fixed', strategy: 'balanced', // 'flat', 'trickle' - rawLeaves: false, + rawLeaves: true, onlyHash: false, reduceSingleLeafToSelf: true, codec: 'dag-pb', @@ -48,7 +48,7 @@ module.exports = async function * (source, ipld, options = {}) { opts.rawLeaves = true } - // go-ifps trickle dag defaults to unixfs raw leaves, balanced dag defaults to file leaves + // go-ipfs trickle dag defaults to unixfs raw leaves, balanced dag defaults to file leaves if (options.strategy === 'trickle') { opts.leafType = 'raw' opts.reduceSingleLeafToSelf = false @@ -58,6 +58,12 @@ module.exports = async function * (source, ipld, options = {}) { opts.codec = options.format } + if (options.leafType && options.rawLeaves == null) { + // if the user has specified a custom UnixFS leaf type and not specified + // raw leaves, really do not use raw leaves + opts.rawLeaves = false + } + let dagBuilder if (typeof options.dagBuilder === 'function') { diff --git a/test/chunker-custom.spec.js b/test/chunker-custom.spec.js index 8bc80dd..5d445b3 100644 --- a/test/chunker-custom.spec.js +++ b/test/chunker-custom.spec.js @@ -46,7 +46,7 @@ describe('custom chunker', function () { for await (const part of importer([{ path: 'test', content }], inmem, { chunker })) { - expect(part.size).to.equal(116) + expect(part.size).to.equal(104) } }) diff --git a/test/hash-parity-with-go-ipfs.spec.js b/test/hash-parity-with-go-ipfs.spec.js index 94e44fd..66a4ebd 100644 --- a/test/hash-parity-with-go-ipfs.spec.js +++ b/test/hash-parity-with-go-ipfs.spec.js @@ -25,7 +25,8 @@ const expectedHashes = { strategies.forEach(strategy => { const options = { - strategy: strategy + strategy: strategy, + rawLeaves: false } describe('go-ipfs interop using importer:' + strategy, () => { diff --git a/test/importer.spec.js b/test/importer.spec.js index 0f5b658..652d47d 100644 --- a/test/importer.spec.js +++ b/test/importer.spec.js @@ -47,7 +47,7 @@ const baseFiles = { path: '200Bytes.txt' }, '1.2MiB.txt': { - cid: 'QmW7BDxEbGqxxSYVtn3peNPQgdDXbWkoQ6J1EFYAEuQV3Q', + cid: 'QmQLTvhjmSa7657mKdSfTjxFBdwxmK8n9tZC9Xdp9DtxWY', size: 1258000, type: 'file', path: '1.2MiB.txt' @@ -64,19 +64,19 @@ const strategyBaseFiles = { flat: baseFiles, balanced: extend({}, baseFiles, { '1.2MiB.txt': { - cid: 'QmW7BDxEbGqxxSYVtn3peNPQgdDXbWkoQ6J1EFYAEuQV3Q', + cid: 'QmQLTvhjmSa7657mKdSfTjxFBdwxmK8n9tZC9Xdp9DtxWY', type: 'file' } }), trickle: extend({}, baseFiles, { '200Bytes.txt': { - cid: 'QmY8bwnoKAKvJ8qtyPhWNxSS6sxiGVTJ9VpdQffs2KB5pE', + cid: 'QmagyRwMfYhczYNv5SvcJc8xxXjZQBTTHS2jEqNMva2mYT', size: 200, type: 'file', path: '200Bytes.txt' }, '1.2MiB.txt': { - cid: 'QmfAxsHrpaLLuhbqqbo9KQyvQNawMnVSwutYoJed75pnco', + cid: 'QmQLTvhjmSa7657mKdSfTjxFBdwxmK8n9tZC9Xdp9DtxWY', type: 'file' } }) @@ -91,25 +91,25 @@ const strategies = [ const strategyOverrides = { balanced: { 'foo-big': { - cid: 'QmaFgyFJUP4fxFySJCddg2Pj6rpwSywopWk87VEVv52RSj', + cid: 'QmR6jJgszuuWxVDVc4PCRa6domvNf6guQK7pboZnbwnht1', path: 'foo-big', size: 1335478, type: 'directory' }, pim: { - cid: 'QmY8a78tx6Tk6naDgWCgTsd9EqGrUJRrH7dDyQhjyrmH2i', + cid: 'QmSA5QoZVieErY7TuQihgUiVc8BB6ZzgwnKvZSHcdTyFiK', path: 'pim', size: 1335744, type: 'directory' }, 'pam/pum': { - cid: 'QmY8a78tx6Tk6naDgWCgTsd9EqGrUJRrH7dDyQhjyrmH2i', + cid: 'QmSA5QoZVieErY7TuQihgUiVc8BB6ZzgwnKvZSHcdTyFiK', path: 'pam/pum', size: 1335744, type: 'directory' }, pam: { - cid: 'QmRgdtzNx1H1BPJqShdhvWZ2D4DA2HUgZJ3XLtoXei27Av', + cid: 'QmTQcBCQvVR68R3r4eJwKL4VwDci1TCH5cj1njmcQ95Eif', path: 'pam', size: 2671269, type: 'directory' @@ -117,25 +117,25 @@ const strategyOverrides = { }, trickle: { 'foo-big': { - cid: 'QmaKbhFRy9kcCbcwrLsqYHWMiY44BDYkqTCMpAxDdd2du2', + cid: 'QmR6jJgszuuWxVDVc4PCRa6domvNf6guQK7pboZnbwnht1', path: 'foo-big', size: 1334657, type: 'directory' }, pim: { - cid: 'QmbWGdnua4YuYpWJb7fE25PRbW9GbKKLqq9Ucmnsg2gxnt', + cid: 'QmZmrY8nLks6N66f68c1biFpay3dSVy9PGvVRZLgT2T2gD', path: 'pim', size: 1334923, type: 'directory' }, 'pam/pum': { - cid: 'QmbWGdnua4YuYpWJb7fE25PRbW9GbKKLqq9Ucmnsg2gxnt', + cid: 'QmZmrY8nLks6N66f68c1biFpay3dSVy9PGvVRZLgT2T2gD', path: 'pam/pum', size: 1334923, type: 'directory' }, pam: { - cid: 'QmSuh47G9Qm3PFv1zziojtHxqCjuurSdtWAzxLxoKJPq2U', + cid: 'Qmf1necujkKZHrYyj56eKFKAi9DPH8UyZg44KaTciso2Sc', path: 'pam', size: 2669627, type: 'directory' @@ -147,19 +147,19 @@ const strategyOverrides = { type: 'file' }, 'foo/bar': { - cid: 'QmTGMxKPzSGNBDp6jhTwnZxGW6w1S9ciyycRJ4b2qcQaHK', + cid: 'QmcKJQd9cH6sip78HbjkGVLshjxtjfvnZ5ih169kZUK3Yg', size: 0, path: 'foo/bar', type: 'directory' }, foo: { - cid: 'Qme4A8fZmwfZESappfPcxSMTZVACiEzhHKtYRMuM1hbkDp', + cid: 'QmQG6CVVTiz1TwnnueRMBZxB6d5WGJakRocHe4KGz5yHzg', size: 0, path: 'foo', type: 'directory' }, 'small.txt': { - cid: 'QmXmZ3qT328JxWtQXqrmvma2FmPp7tMdNiSuYvVJ5QRhKs', + cid: 'QmX2YDaeaAFbyNK4gat5jQMJhZL171n5Qummc5jcMsmXUu', size: 15, type: 'file', path: 'small.txt' @@ -225,7 +225,7 @@ strategies.forEach((strategy) => { }), 'foo-big': { path: 'foo-big', - cid: 'QmaFgyFJUP4fxFySJCddg2Pj6rpwSywopWk87VEVv52RSj', + cid: 'QmR6jJgszuuWxVDVc4PCRa6domvNf6guQK7pboZnbwnht1', size: 1328120, type: 'directory' }, @@ -237,7 +237,7 @@ strategies.forEach((strategy) => { }), pim: { path: 'pim', - cid: 'QmY8a78tx6Tk6naDgWCgTsd9EqGrUJRrH7dDyQhjyrmH2i', + cid: 'QmSA5QoZVieErY7TuQihgUiVc8BB6ZzgwnKvZSHcdTyFiK', size: 1328386, type: 'directory' }, @@ -248,13 +248,13 @@ strategies.forEach((strategy) => { type: 'directory' }, 'pam/pum': { - cid: 'QmY8a78tx6Tk6naDgWCgTsd9EqGrUJRrH7dDyQhjyrmH2i', + cid: 'QmSA5QoZVieErY7TuQihgUiVc8BB6ZzgwnKvZSHcdTyFiK', path: 'pam/pum', size: 1328386, type: 'directory' }, pam: { - cid: 'QmRgdtzNx1H1BPJqShdhvWZ2D4DA2HUgZJ3XLtoXei27Av', + cid: 'QmTQcBCQvVR68R3r4eJwKL4VwDci1TCH5cj1njmcQ95Eif', path: 'pam', size: 2656553, type: 'directory'