// Copyright 2017 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#define ZXDEBUG 0

#include
#include
#include
#include
#include
#include
#include

namespace blobfs {
namespace {

using digest::Digest;
using digest::MerkleTree;

// RAII interface for registering latency events.
class LatencyEvent {
public:
    LatencyEvent(cobalt_client::Histogram* histogram, bool collect)
        : timer_(collect), histogram_(histogram) {}

    LatencyEvent(LatencyEvent&& rhs)
        : timer_(std::move(rhs.timer_)), histogram_(std::move(rhs.histogram_)) {}

    ~LatencyEvent() {
        zx::duration latency = timer_.End();
        if (latency.get() > 0) {
            ZX_DEBUG_ASSERT(histogram_ != nullptr);
            histogram_->Add(latency.get());
        }
    }

private:
    cobalt_client::Timer timer_;
    cobalt_client::Histogram* histogram_;
};

// A wrapper around "Enqueue" for content which risks being larger
// than the writeback buffer.
//
// For content which is smaller than 3/4 the size of the writeback buffer: the
// content is enqueued to |work| without flushing.
//
// For content which is larger than 3/4 the size of the writeback buffer: flush
// the data by enqueueing it to the writeback thread in chunks until the
// remainder is small enough to comfortably fit within the writeback buffer.
zx_status_t EnqueuePaginated(fbl::unique_ptr<WritebackWork>* work, Blobfs* blobfs, VnodeBlob* vn,
                             const zx::vmo& vmo, uint64_t relative_block, uint64_t absolute_block,
                             uint64_t nblocks) {
    const size_t kMaxChunkBlocks = (3 * blobfs->WritebackCapacity()) / 4;
    uint64_t delta_blocks = fbl::min(nblocks, kMaxChunkBlocks);
    while (nblocks > 0) {
        (*work)->Enqueue(vmo, relative_block, absolute_block, delta_blocks);
        relative_block += delta_blocks;
        absolute_block += delta_blocks;
        nblocks -= delta_blocks;
        delta_blocks = fbl::min(nblocks, kMaxChunkBlocks);
        if (nblocks) {
            fbl::unique_ptr<WritebackWork> tmp;
            zx_status_t status = blobfs->CreateWork(&tmp, vn);
            if (status != ZX_OK) {
                return status;
            }
            if ((status = blobfs->EnqueueWork(std::move(*work), EnqueueType::kData)) != ZX_OK) {
                return status;
            }
            *work = std::move(tmp);
        }
    }
    return ZX_OK;
}

} // namespace

zx_status_t VnodeBlob::Verify() const {
    TRACE_DURATION("blobfs", "Blobfs::Verify");
    fs::Ticker ticker(blobfs_->LocalMetrics().Collecting());

    const void* data = inode_.blob_size ? GetData() : nullptr;
    const void* tree = inode_.blob_size ? GetMerkle() : nullptr;
    const uint64_t data_size = inode_.blob_size;
    const uint64_t merkle_size = MerkleTree::GetTreeLength(data_size);
    // TODO(smklein): We could lazily verify more of the VMO if
    // we could fault in pages on-demand.
    //
    // For now, we aggressively verify the entire VMO up front.
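    //
    // Layout of |mapping_| (see GetMerkle()/GetData() below): the Merkle tree
    // occupies the first MerkleTreeBlocks(inode_) blocks of the VMO and the
    // blob data immediately follows it, so MerkleTree::Verify() checks the
    // whole range [0, data_size) of |data| against |tree| and the root digest
    // returned by GetKey().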
    Digest digest(GetKey());
    zx_status_t status =
        MerkleTree::Verify(data, data_size, tree, merkle_size, 0, data_size, digest);
    blobfs_->LocalMetrics().UpdateMerkleVerify(data_size, merkle_size, ticker.End());

    if (status != ZX_OK) {
        char name[Digest::kLength * 2 + 1];
        ZX_ASSERT(digest.ToString(name, sizeof(name)) == ZX_OK);
        FS_TRACE_ERROR("blobfs verify(%s) Failure: %s\n", name, zx_status_get_string(status));
    }

    return status;
}

zx_status_t VnodeBlob::InitVmos() {
    TRACE_DURATION("blobfs", "Blobfs::InitVmos");

    if (mapping_.vmo()) {
        return ZX_OK;
    }

    uint64_t data_blocks = BlobDataBlocks(inode_);
    uint64_t merkle_blocks = MerkleTreeBlocks(inode_);
    uint64_t num_blocks = data_blocks + merkle_blocks;

    if (num_blocks == 0) {
        // No need to initialize VMO for null blob.
        return ZX_OK;
    }

    // Reverts blob back to uninitialized state on error.
    auto cleanup = fbl::MakeAutoCall([this]() { BlobCloseHandles(); });

    size_t vmo_size;
    if (mul_overflow(num_blocks, kBlobfsBlockSize, &vmo_size)) {
        FS_TRACE_ERROR("Multiplication overflow\n");
        return ZX_ERR_OUT_OF_RANGE;
    }

    zx_status_t status = mapping_.CreateAndMap(vmo_size, "blob");
    if (status != ZX_OK) {
        FS_TRACE_ERROR("Failed to initialize vmo; error: %d\n", status);
        return status;
    }
    if ((status = blobfs_->AttachVmo(mapping_.vmo(), &vmoid_)) != ZX_OK) {
        FS_TRACE_ERROR("Failed to attach VMO to block device; error: %d\n", status);
        return status;
    }

    if ((inode_.header.flags & kBlobFlagLZ4Compressed) != 0) {
        if ((status = InitCompressed()) != ZX_OK) {
            return status;
        }
    } else {
        if ((status = InitUncompressed()) != ZX_OK) {
            return status;
        }
    }
    if ((status = Verify()) != ZX_OK) {
        return status;
    }

    cleanup.cancel();
    return ZX_OK;
}

zx_status_t VnodeBlob::InitCompressed() {
    TRACE_DURATION("blobfs", "Blobfs::InitCompressed", "size", inode_.blob_size,
                   "blocks", inode_.block_count);
    fs::Ticker ticker(blobfs_->LocalMetrics().Collecting());
    fs::ReadTxn txn(blobfs_);
    uint32_t merkle_blocks = MerkleTreeBlocks(inode_);

    fzl::OwnedVmoMapper compressed_mapper;
    uint32_t compressed_blocks = (inode_.block_count - merkle_blocks);
    size_t compressed_size;
    if (mul_overflow(compressed_blocks, kBlobfsBlockSize, &compressed_size)) {
        FS_TRACE_ERROR("Multiplication overflow\n");
        return ZX_ERR_OUT_OF_RANGE;
    }
    zx_status_t status = compressed_mapper.CreateAndMap(compressed_size, "compressed-blob");
    if (status != ZX_OK) {
        FS_TRACE_ERROR("Failed to initialize compressed vmo; error: %d\n", status);
        return status;
    }
    vmoid_t compressed_vmoid;
    status = blobfs_->AttachVmo(compressed_mapper.vmo(), &compressed_vmoid);
    if (status != ZX_OK) {
        FS_TRACE_ERROR("Failed to attach compressed VMO to blkdev: %d\n", status);
        return status;
    }

    auto detach = fbl::MakeAutoCall([this, &compressed_vmoid]() {
        blobfs_->DetachVmo(compressed_vmoid);
    });

    const uint64_t kDataStart = DataStartBlock(blobfs_->Info());
    AllocatedExtentIterator extent_iter(blobfs_->GetAllocator(), GetMapIndex());
    BlockIterator block_iter(&extent_iter);

    // Read the uncompressed merkle tree into the start of the blob's VMO.
    status = StreamBlocks(&block_iter, merkle_blocks,
                          [&](uint64_t vmo_offset, uint64_t dev_offset, uint32_t length) {
                              txn.Enqueue(vmoid_, vmo_offset, dev_offset + kDataStart, length);
                              return ZX_OK;
                          });
    if (status != ZX_OK) {
        return status;
    }

    // Read the compressed blocks into the compressed VMO, accounting for the merkle blocks
    // which have already been seen.
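    //
    // For example (illustrative numbers only): with merkle_blocks == 2, the block
    // iterator resumes at blob-relative block 2, so blob-relative block 5 lands at
    // block 3 of the compressed VMO (vmo_offset - merkle_blocks).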
    ZX_DEBUG_ASSERT(block_iter.BlockIndex() == merkle_blocks);
    status = StreamBlocks(&block_iter, compressed_blocks,
                          [&](uint64_t vmo_offset, uint64_t dev_offset, uint32_t length) {
                              txn.Enqueue(compressed_vmoid, vmo_offset - merkle_blocks,
                                          dev_offset + kDataStart, length);
                              return ZX_OK;
                          });
    if (status != ZX_OK) {
        return status;
    }

    if ((status = txn.Transact()) != ZX_OK) {
        FS_TRACE_ERROR("Failed to flush read transaction: %d\n", status);
        return status;
    }

    fs::Duration read_time = ticker.End();
    ticker.Reset();

    // Decompress the compressed data into the target buffer.
    size_t target_size = inode_.blob_size;
    status = Decompressor::Decompress(GetData(), &target_size, compressed_mapper.start(),
                                      &compressed_size);
    if (status != ZX_OK) {
        FS_TRACE_ERROR("Failed to decompress data: %d\n", status);
        return status;
    } else if (target_size != inode_.blob_size) {
        FS_TRACE_ERROR("Failed to fully decompress blob (%zu of %zu expected)\n",
                       target_size, inode_.blob_size);
        return ZX_ERR_IO_DATA_INTEGRITY;
    }

    blobfs_->LocalMetrics().UpdateMerkleDecompress(compressed_blocks * kBlobfsBlockSize,
                                                   inode_.blob_size, read_time, ticker.End());
    return ZX_OK;
}

zx_status_t VnodeBlob::InitUncompressed() {
    TRACE_DURATION("blobfs", "Blobfs::InitUncompressed", "size", inode_.blob_size,
                   "blocks", inode_.block_count);
    fs::Ticker ticker(blobfs_->LocalMetrics().Collecting());
    fs::ReadTxn txn(blobfs_);

    AllocatedExtentIterator extent_iter(blobfs_->GetAllocator(), GetMapIndex());
    BlockIterator block_iter(&extent_iter);
    // Read both the uncompressed merkle tree and data.
    const uint64_t blob_data_blocks = BlobDataBlocks(inode_);
    const uint64_t merkle_blocks = MerkleTreeBlocks(inode_);
    if (blob_data_blocks + merkle_blocks > std::numeric_limits<uint32_t>::max()) {
        return ZX_ERR_IO_DATA_INTEGRITY;
    }
    const uint32_t length = static_cast<uint32_t>(blob_data_blocks + merkle_blocks);
    const uint64_t data_start = DataStartBlock(blobfs_->Info());
    zx_status_t status = StreamBlocks(&block_iter, length,
                                      [&](uint64_t vmo_offset, uint64_t dev_offset,
                                          uint32_t length) {
                                          txn.Enqueue(vmoid_, vmo_offset,
                                                      dev_offset + data_start, length);
                                          return ZX_OK;
                                      });
    if (status != ZX_OK) {
        return status;
    }

    status = txn.Transact();
    if (status != ZX_OK) {
        return status;
    }
    blobfs_->LocalMetrics().UpdateMerkleDiskRead(length * kBlobfsBlockSize, ticker.End());
    return status;
}

void VnodeBlob::PopulateInode(uint32_t node_index) {
    ZX_DEBUG_ASSERT(map_index_ == 0);
    SetState(kBlobStateReadable);
    map_index_ = node_index;
    Inode* inode = blobfs_->GetNode(node_index);
    inode_ = *inode;
}

uint64_t VnodeBlob::SizeData() const {
    if (GetState() == kBlobStateReadable) {
        return inode_.blob_size;
    }
    return 0;
}

VnodeBlob::VnodeBlob(Blobfs* bs, const Digest& digest)
    : CacheNode(digest), blobfs_(bs),
      flags_(kBlobStateEmpty), syncing_(false), clone_watcher_(this) {}

VnodeBlob::VnodeBlob(Blobfs* bs)
    : blobfs_(bs),
      flags_(kBlobStateEmpty | kBlobFlagDirectory), syncing_(false),
      clone_watcher_(this) {}

void VnodeBlob::BlobCloseHandles() {
    mapping_.Reset();
    readable_event_.reset();
}

zx_status_t VnodeBlob::SpaceAllocate(uint64_t size_data) {
    TRACE_DURATION("blobfs", "Blobfs::SpaceAllocate", "size_data", size_data);
    fs::Ticker ticker(blobfs_->LocalMetrics().Collecting());

    if (GetState() != kBlobStateEmpty) {
        return ZX_ERR_BAD_STATE;
    }

    auto write_info = fbl::make_unique<WritebackInfo>();

    // Initialize the inode with known fields.
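    // |block_count| is the sum of the Merkle tree blocks and the data blocks, each
    // rounded up to a multiple of kBlobfsBlockSize; compression may later shrink
    // this count (see WriteInternal()).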
    memset(inode_.merkle_root_hash, 0, Digest::kLength);
    inode_.blob_size = size_data;
    inode_.block_count = MerkleTreeBlocks(inode_) + static_cast<uint32_t>(BlobDataBlocks(inode_));

    // Special case for the null blob: We skip the write phase.
    if (inode_.blob_size == 0) {
        zx_status_t status = blobfs_->ReserveNodes(1, &write_info->node_indices);
        if (status != ZX_OK) {
            return status;
        }
        map_index_ = write_info->node_indices[0].index();
        write_info_ = std::move(write_info);

        if ((status = Verify()) != ZX_OK) {
            return status;
        }

        SetState(kBlobStateDataWrite);
        if ((status = WriteMetadata()) != ZX_OK) {
            FS_TRACE_ERROR("Null blob metadata fail: %d\n", status);
            return status;
        }
        return ZX_OK;
    }

    fbl::Vector<ReservedExtent> extents;
    fbl::Vector<ReservedNode> nodes;

    // Reserve space for the blob.
    zx_status_t status = blobfs_->ReserveBlocks(inode_.block_count, &extents);
    if (status != ZX_OK) {
        return status;
    }
    if (extents.size() > kMaxBlobExtents) {
        FS_TRACE_ERROR("Error: Block reservation requires too many extents (%zu vs %zu max)\n",
                       extents.size(), kMaxBlobExtents);
        return ZX_ERR_BAD_STATE;
    }
    const ExtentCountType extent_count = static_cast<ExtentCountType>(extents.size());

    // Reserve space for all the nodes necessary to contain this blob.
    size_t node_count = NodePopulator::NodeCountForExtents(extent_count);
    status = blobfs_->ReserveNodes(node_count, &nodes);
    if (status != ZX_OK) {
        return status;
    }

    if (inode_.blob_size >= kCompressionMinBytesSaved) {
        size_t max = Compressor::BufferMax(inode_.blob_size);
        status = write_info->compressed_blob.CreateAndMap(max, "compressed-blob");
        if (status != ZX_OK) {
            return status;
        }
        status = write_info->compressor.Initialize(write_info->compressed_blob.start(),
                                                   write_info->compressed_blob.size());
        if (status != ZX_OK) {
            FS_TRACE_ERROR("blobfs: Failed to initialize compressor: %d\n", status);
            return status;
        }
    }

    // Open VMOs, so we can begin writing after allocate succeeds.
    fzl::OwnedVmoMapper mapping;
    if ((status = mapping.CreateAndMap(inode_.block_count * kBlobfsBlockSize, "blob")) != ZX_OK) {
        return status;
    }
    if ((status = blobfs_->AttachVmo(mapping.vmo(), &vmoid_)) != ZX_OK) {
        return status;
    }

    map_index_ = nodes[0].index();
    mapping_ = std::move(mapping);
    write_info->extents = std::move(extents);
    write_info->node_indices = std::move(nodes);
    write_info_ = std::move(write_info);

    SetState(kBlobStateDataWrite);
    blobfs_->LocalMetrics().UpdateAllocation(size_data, ticker.End());
    return ZX_OK;
}

void* VnodeBlob::GetData() const {
    return fs::GetBlock(kBlobfsBlockSize, mapping_.start(), MerkleTreeBlocks(inode_));
}

void* VnodeBlob::GetMerkle() const {
    return mapping_.start();
}

zx_status_t VnodeBlob::WriteMetadata() {
    TRACE_DURATION("blobfs", "Blobfs::WriteMetadata");
    assert(GetState() == kBlobStateDataWrite);

    zx_status_t status;
    fbl::unique_ptr<WritebackWork> wb;
    if ((status = blobfs_->CreateWork(&wb, this)) != ZX_OK) {
        return status;
    }

    // Update the on-disk hash.
    memcpy(inode_.merkle_root_hash, GetKey(), Digest::kLength);

    // All data has been written to the containing VMO.
    SetState(kBlobStateReadable);
    if (readable_event_.is_valid()) {
        status = readable_event_.signal(0u, ZX_USER_SIGNAL_0);
        if (status != ZX_OK) {
            SetState(kBlobStateError);
            return status;
        }
    }

    atomic_store(&syncing_, true);

    if (inode_.block_count) {
        // We utilize the NodePopulator class to take our reserved blocks and nodes and fill the
        // persistent map with an allocated inode / container.

        // If |on_node| is invoked on a node, it means that node was necessary to represent this
        // blob. Persist the node back to durable storage.
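        //
        // (NodePopulator::Walk() below invokes |on_node| and |on_extent| as it commits the
        // reserved nodes and extents; see the |on_extent| comment for the early-stop behavior
        // when compression used fewer blocks than were reserved.)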
        auto on_node = [this, &wb](const ReservedNode& node) {
            blobfs_->PersistNode(wb.get(), node.index());
        };

        // If |on_extent| is invoked on an extent, it was necessary to represent this blob. Persist
        // the allocation of these blocks back to durable storage.
        //
        // Additionally, because of the compression feature of blobfs, it is possible we reserved
        // more extents than this blob ended up using. Decrement |remaining_blocks| to track if we
        // should exit early.
        size_t remaining_blocks = inode_.block_count;
        auto on_extent = [this, &wb, &remaining_blocks](ReservedExtent& extent) {
            ZX_DEBUG_ASSERT(remaining_blocks > 0);
            if (remaining_blocks >= extent.extent().Length()) {
                // Consume the entire extent.
                remaining_blocks -= extent.extent().Length();
            } else {
                // Consume only part of the extent; we're done iterating.
                extent.SplitAt(static_cast<BlockCountType>(remaining_blocks));
                remaining_blocks = 0;
            }
            blobfs_->PersistBlocks(wb.get(), extent);
            if (remaining_blocks == 0) {
                return NodePopulator::IterationCommand::Stop;
            }
            return NodePopulator::IterationCommand::Continue;
        };

        Inode* mapped_inode = blobfs_->GetNode(map_index_);
        *mapped_inode = inode_;
        NodePopulator populator(blobfs_->GetAllocator(), std::move(write_info_->extents),
                                std::move(write_info_->node_indices));
        ZX_ASSERT(populator.Walk(on_node, on_extent) == ZX_OK);

        // Ensure all non-allocation flags are propagated to the inode.
        mapped_inode->header.flags |= (inode_.header.flags & kBlobFlagLZ4Compressed);
    } else {
        // Special case: Empty node.
        ZX_DEBUG_ASSERT(write_info_->node_indices.size() == 1);
        const ReservedNode& node = write_info_->node_indices[0];
        blobfs_->GetAllocator()->MarkInodeAllocated(node);
        blobfs_->PersistNode(wb.get(), node.index());
    }

    wb->SetSyncComplete();
    if ((status = blobfs_->EnqueueWork(std::move(wb), EnqueueType::kJournal)) != ZX_OK) {
        return status;
    }

    // Drop the write info, since we no longer need it.
    write_info_.reset();
    return status;
}

zx_status_t VnodeBlob::WriteInternal(const void* data, size_t len, size_t* actual) {
    TRACE_DURATION("blobfs", "Blobfs::WriteInternal", "data", data, "len", len);

    *actual = 0;
    if (len == 0) {
        return ZX_OK;
    }

    const uint32_t merkle_blocks = MerkleTreeBlocks(inode_);
    const size_t merkle_bytes = merkle_blocks * kBlobfsBlockSize;
    if (GetState() == kBlobStateDataWrite) {
        size_t to_write = fbl::min(len, inode_.blob_size - write_info_->bytes_written);
        size_t offset = write_info_->bytes_written + merkle_bytes;
        zx_status_t status = mapping_.vmo().write(data, offset, to_write);
        if (status != ZX_OK) {
            return status;
        }
        *actual = to_write;
        write_info_->bytes_written += to_write;

        if (write_info_->compressor.Compressing()) {
            if ((status = write_info_->compressor.Update(data, to_write)) != ZX_OK) {
                return status;
            }
            ConsiderCompressionAbort();
        }

        // More data to write.
        if (write_info_->bytes_written < inode_.blob_size) {
            return ZX_OK;
        }

        // Only write data to disk once we've buffered the file into memory.
        // This gives us a chance to try compressing the blob before we write it back.
        fbl::unique_ptr<WritebackWork> wb;
        if ((status = blobfs_->CreateWork(&wb, this)) != ZX_OK) {
            return status;
        }

        // In case the operation fails, forcibly reset the WritebackWork
        // to avoid asserting that no write requests exist on destruction.
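        // (On the success path, |set_error| is cancelled just before returning.)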
        auto set_error = fbl::MakeAutoCall([&]() {
            if (wb != nullptr) {
                wb->Reset(ZX_ERR_BAD_STATE);
            }
            SetState(kBlobStateError);
        });

        if (write_info_->compressor.Compressing()) {
            if ((status = write_info_->compressor.End()) != ZX_OK) {
                return status;
            }
            ConsiderCompressionAbort();
        }

        // Since the merkle tree and data are co-allocated, use a block iterator
        // to parse their data in order.
        VectorExtentIterator extent_iter(write_info_->extents);
        BlockIterator block_iter(&extent_iter);

        // TODO(smklein): As an optimization, use the CreateInit/Update/Final
        // methods to create the merkle tree as we write data, rather than
        // waiting until the data is fully downloaded to create the tree.
        size_t merkle_size = MerkleTree::GetTreeLength(inode_.blob_size);
        fs::Duration generation_time;
        if (merkle_size > 0) {
            Digest digest;
            void* merkle_data = GetMerkle();
            const void* blob_data = GetData();
            // Tracking generation time.
            fs::Ticker ticker(blobfs_->LocalMetrics().Collecting());

            if ((status = MerkleTree::Create(blob_data, inode_.blob_size, merkle_data,
                                             merkle_size, &digest)) != ZX_OK) {
                return status;
            } else if (digest != GetKey()) {
                // Downloaded blob did not match provided digest.
                return ZX_ERR_IO_DATA_INTEGRITY;
            }

            status = StreamBlocks(&block_iter, merkle_blocks,
                                  [&](uint64_t vmo_offset, uint64_t dev_offset, uint32_t length) {
                                      return EnqueuePaginated(&wb, blobfs_, this, mapping_.vmo(),
                                                              vmo_offset,
                                                              dev_offset + blobfs_->DataStart(),
                                                              length);
                                  });
            if (status != ZX_OK) {
                return status;
            }
            generation_time = ticker.End();
        } else if ((status = Verify()) != ZX_OK) {
            // Small blobs may not have associated Merkle Trees, and will
            // require validation, since we are not regenerating and checking
            // the digest.
            return status;
        }

        if (write_info_->compressor.Compressing()) {
            uint64_t blocks64 = fbl::round_up(write_info_->compressor.Size(), kBlobfsBlockSize) /
                                kBlobfsBlockSize;
            ZX_DEBUG_ASSERT(blocks64 <= std::numeric_limits<uint32_t>::max());
            uint32_t blocks = static_cast<uint32_t>(blocks64);
            int64_t vmo_bias = -static_cast<int64_t>(merkle_blocks);
            ZX_DEBUG_ASSERT(block_iter.BlockIndex() + vmo_bias == 0);
            status = StreamBlocks(&block_iter, blocks,
                                  [&](uint64_t vmo_offset, uint64_t dev_offset, uint32_t length) {
                                      return EnqueuePaginated(&wb, blobfs_, this,
                                                              write_info_->compressed_blob.vmo(),
                                                              vmo_offset - merkle_blocks,
                                                              dev_offset + blobfs_->DataStart(),
                                                              length);
                                  });
            if (status != ZX_OK) {
                return status;
            }

            blocks += MerkleTreeBlocks(inode_);
            // By compressing, we used fewer blocks than we originally reserved.
            ZX_DEBUG_ASSERT(inode_.block_count > blocks);

            inode_.block_count = blocks;
            inode_.header.flags |= kBlobFlagLZ4Compressed;
        } else {
            uint64_t blocks64 = fbl::round_up(inode_.blob_size, kBlobfsBlockSize) /
                                kBlobfsBlockSize;
            ZX_DEBUG_ASSERT(blocks64 <= std::numeric_limits<uint32_t>::max());
            uint32_t blocks = static_cast<uint32_t>(blocks64);
            status = StreamBlocks(&block_iter, blocks,
                                  [&](uint64_t vmo_offset, uint64_t dev_offset, uint32_t length) {
                                      return EnqueuePaginated(&wb, blobfs_, this, mapping_.vmo(),
                                                              vmo_offset,
                                                              dev_offset + blobfs_->DataStart(),
                                                              length);
                                  });
            if (status != ZX_OK) {
                return status;
            }
        }

        // Enqueue the blob's final data work. Metadata must be enqueued separately.
        if ((status = blobfs_->EnqueueWork(std::move(wb), EnqueueType::kData)) != ZX_OK) {
            return status;
        }

        // No more data to write. Flush to disk.
        fs::Ticker ticker(blobfs_->LocalMetrics().Collecting()); // Tracking enqueue time.
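        // WriteMetadata() persists the inode and extent allocations via the journal
        // (EnqueueType::kJournal); the blob's data was queued above as EnqueueType::kData,
        // so the two are flushed through separate paths.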
        if ((status = WriteMetadata()) != ZX_OK) {
            return status;
        }
        blobfs_->LocalMetrics().UpdateClientWrite(to_write, merkle_size, ticker.End(),
                                                  generation_time);
        set_error.cancel();
        return ZX_OK;
    }
    return ZX_ERR_BAD_STATE;
}

void VnodeBlob::ConsiderCompressionAbort() {
    ZX_DEBUG_ASSERT(write_info_->compressor.Compressing());
    if (inode_.blob_size - kCompressionMinBytesSaved < write_info_->compressor.Size()) {
        write_info_->compressor.Reset();
        write_info_->compressed_blob.Reset();
    }
}

zx_status_t VnodeBlob::GetReadableEvent(zx_handle_t* out) {
    TRACE_DURATION("blobfs", "Blobfs::GetReadableEvent");
    zx_status_t status;
    // This is the first 'wait until read event' request received.
    if (!readable_event_.is_valid()) {
        status = zx::event::create(0, &readable_event_);
        if (status != ZX_OK) {
            return status;
        } else if (GetState() == kBlobStateReadable) {
            readable_event_.signal(0u, ZX_USER_SIGNAL_0);
        }
    }
    status = zx_handle_duplicate(readable_event_.get(), ZX_RIGHTS_BASIC, out);
    if (status != ZX_OK) {
        return status;
    }
    return sizeof(zx_handle_t);
}

zx_status_t VnodeBlob::CloneVmo(zx_rights_t rights, zx_handle_t* out) {
    TRACE_DURATION("blobfs", "Blobfs::CloneVmo", "rights", rights, "out", out);
    if (GetState() != kBlobStateReadable) {
        return ZX_ERR_BAD_STATE;
    }
    if (inode_.blob_size == 0) {
        return ZX_ERR_BAD_STATE;
    }
    zx_status_t status = InitVmos();
    if (status != ZX_OK) {
        return status;
    }

    // TODO(smklein): Only clone / verify the part of the vmo that
    // was requested.
    const size_t merkle_bytes = MerkleTreeBlocks(inode_) * kBlobfsBlockSize;
    zx::vmo clone;
    if ((status = mapping_.vmo().clone(ZX_VMO_CLONE_COPY_ON_WRITE, merkle_bytes,
                                       inode_.blob_size, &clone)) != ZX_OK) {
        return status;
    }

    // TODO(mdempsky): Push elsewhere.
    if ((status = clone.replace_as_executable(zx::handle(), &clone)) != ZX_OK) {
        return status;
    }

    if ((status = clone.replace(rights, &clone)) != ZX_OK) {
        return status;
    }
    *out = clone.release();

    if (clone_watcher_.object() == ZX_HANDLE_INVALID) {
        clone_watcher_.set_object(mapping_.vmo().get());
        clone_watcher_.set_trigger(ZX_VMO_ZERO_CHILDREN);

        // Keep a reference to "this" alive, preventing the blob
        // from being closed while someone may still be using the
        // underlying memory.
        //
        // We'll release it when no client-held VMOs are in use.
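        //
        // The wait below triggers on ZX_VMO_ZERO_CHILDREN: once every clone handed
        // out above has been closed, HandleNoClones() runs and drops |clone_ref_|.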
        clone_ref_ = fbl::RefPtr<VnodeBlob>(this);
        clone_watcher_.Begin(blobfs_->dispatcher());
    }

    return ZX_OK;
}

void VnodeBlob::HandleNoClones(async_dispatcher_t* dispatcher, async::WaitBase* wait,
                               zx_status_t status, const zx_packet_signal_t* signal) {
    ZX_DEBUG_ASSERT(status == ZX_OK);
    ZX_DEBUG_ASSERT((signal->observed & ZX_VMO_ZERO_CHILDREN) != 0);
    ZX_DEBUG_ASSERT(clone_watcher_.object() != ZX_HANDLE_INVALID);
    clone_watcher_.set_object(ZX_HANDLE_INVALID);
    clone_ref_ = nullptr;
}

zx_status_t VnodeBlob::ReadInternal(void* data, size_t len, size_t off, size_t* actual) {
    TRACE_DURATION("blobfs", "Blobfs::ReadInternal", "len", len, "off", off);

    if (GetState() != kBlobStateReadable) {
        return ZX_ERR_BAD_STATE;
    }

    if (inode_.blob_size == 0) {
        *actual = 0;
        return ZX_OK;
    }

    zx_status_t status = InitVmos();
    if (status != ZX_OK) {
        return status;
    }

    Digest d(GetKey());

    if (off >= inode_.blob_size) {
        *actual = 0;
        return ZX_OK;
    }
    if (len > (inode_.blob_size - off)) {
        len = inode_.blob_size - off;
    }

    const size_t merkle_bytes = MerkleTreeBlocks(inode_) * kBlobfsBlockSize;
    status = mapping_.vmo().read(data, merkle_bytes + off, len);
    if (status == ZX_OK) {
        *actual = len;
    }
    return status;
}

zx_status_t VnodeBlob::QueueUnlink() {
    flags_ |= kBlobFlagDeletable;
    // Attempt to purge in case the blob has been unlinked with no open fds.
    return TryPurge();
}

zx_status_t VnodeBlob::VerifyBlob(Blobfs* bs, uint32_t node_index) {
    Inode* inode = bs->GetNode(node_index);
    Digest digest(inode->merkle_root_hash);
    fbl::AllocChecker ac;
    fbl::RefPtr<VnodeBlob> vn = fbl::AdoptRef(new (&ac) VnodeBlob(bs, digest));
    if (!ac.check()) {
        return ZX_ERR_NO_MEMORY;
    }

    vn->PopulateInode(node_index);

    // If we are unable to read in the blob from disk, this should also be a VerifyBlob error.
    // Since InitVmos calls Verify as its final step, we can just return its result here.
    return vn->InitVmos();
}

BlobCache& VnodeBlob::Cache() {
    return blobfs_->Cache();
}

bool VnodeBlob::ShouldCache() const {
    switch (GetState()) {
    // All "Valid", cacheable states, where the blob still exists on storage.
    case kBlobStateReadable:
        return true;
    default:
        return false;
    }
}

void VnodeBlob::ActivateLowMemory() {
    // We shouldn't be putting the blob into a low-memory state while it is still mapped.
    ZX_ASSERT(clone_watcher_.object() == ZX_HANDLE_INVALID);

    if (mapping_.vmo()) {
        blobfs_->DetachVmo(vmoid_);
    }
    mapping_.Reset();
}

VnodeBlob::~VnodeBlob() {
    ActivateLowMemory();
}

zx_status_t VnodeBlob::ValidateFlags(uint32_t flags) {
    if ((flags & ZX_FS_FLAG_DIRECTORY) && !IsDirectory()) {
        return ZX_ERR_NOT_DIR;
    }

    if (flags & ZX_FS_RIGHT_WRITABLE) {
        if (IsDirectory()) {
            return ZX_ERR_NOT_FILE;
        } else if (GetState() != kBlobStateEmpty) {
            return ZX_ERR_ACCESS_DENIED;
        }
    }
    return ZX_OK;
}

zx_status_t VnodeBlob::Readdir(fs::vdircookie_t* cookie, void* dirents, size_t len,
                               size_t* out_actual) {
    if (!IsDirectory()) {
        return ZX_ERR_NOT_DIR;
    }

    return blobfs_->Readdir(cookie, dirents, len, out_actual);
}

zx_status_t VnodeBlob::Read(void* data, size_t len, size_t off, size_t* out_actual) {
    TRACE_DURATION("blobfs", "VnodeBlob::Read", "len", len, "off", off);
    LatencyEvent event(&blobfs_->GetMutableVnodeMetrics()->read, blobfs_->CollectingMetrics());

    if (IsDirectory()) {
        return ZX_ERR_NOT_FILE;
    }

    return ReadInternal(data, len, off, out_actual);
}

zx_status_t VnodeBlob::Write(const void* data, size_t len, size_t offset, size_t* out_actual) {
    TRACE_DURATION("blobfs", "VnodeBlob::Write", "len", len, "off", offset);
    LatencyEvent event(&blobfs_->GetMutableVnodeMetrics()->write, blobfs_->CollectingMetrics());
    if (IsDirectory()) {
        return ZX_ERR_NOT_FILE;
    }
    return WriteInternal(data, len, out_actual);
}

zx_status_t VnodeBlob::Append(const void* data, size_t len, size_t* out_end, size_t* out_actual) {
    LatencyEvent event(&blobfs_->GetMutableVnodeMetrics()->append, blobfs_->CollectingMetrics());
    zx_status_t status = WriteInternal(data, len, out_actual);
    if (GetState() == kBlobStateDataWrite) {
        ZX_DEBUG_ASSERT(write_info_ != nullptr);
        *out_actual = write_info_->bytes_written;
    } else {
        *out_actual = inode_.blob_size;
    }
    return status;
}

zx_status_t VnodeBlob::Lookup(fbl::RefPtr<fs::Vnode>* out, fbl::StringPiece name) {
    TRACE_DURATION("blobfs", "VnodeBlob::Lookup", "name", name);
    LatencyEvent event(&blobfs_->GetMutableVnodeMetrics()->look_up, blobfs_->CollectingMetrics());
    assert(memchr(name.data(), '/', name.length()) == nullptr);
    if (name == "." && IsDirectory()) {
        // Special case: Accessing root directory via '.'
        *out = fbl::RefPtr<fs::Vnode>(this);
        return ZX_OK;
    }

    if (!IsDirectory()) {
        return ZX_ERR_NOT_SUPPORTED;
    }

    zx_status_t status;
    Digest digest;
    if ((status = digest.Parse(name.data(), name.length())) != ZX_OK) {
        return status;
    }
    fbl::RefPtr<CacheNode> cache_node;
    if ((status = Cache().Lookup(digest, &cache_node)) != ZX_OK) {
        return status;
    }
    auto vnode = fbl::RefPtr<VnodeBlob>::Downcast(std::move(cache_node));
    blobfs_->LocalMetrics().UpdateLookup(vnode->SizeData());
    *out = std::move(vnode);
    return ZX_OK;
}

zx_status_t VnodeBlob::Getattr(vnattr_t* a) {
    LatencyEvent event(&blobfs_->GetMutableVnodeMetrics()->get_attr,
                       blobfs_->CollectingMetrics());
    memset(a, 0, sizeof(vnattr_t));
    a->mode = (IsDirectory() ? V_TYPE_DIR : V_TYPE_FILE) | V_IRUSR;
    a->inode = fuchsia_io_INO_UNKNOWN;
    a->size = IsDirectory() ? 0 : SizeData();
    a->blksize = kBlobfsBlockSize;
    a->blkcount = inode_.block_count * (kBlobfsBlockSize / VNATTR_BLKSIZE);
    a->nlink = 1;
    a->create_time = 0;
    a->modify_time = 0;
    return ZX_OK;
}

zx_status_t VnodeBlob::Create(fbl::RefPtr<fs::Vnode>* out, fbl::StringPiece name, uint32_t mode) {
    TRACE_DURATION("blobfs", "VnodeBlob::Create", "name", name, "mode", mode);
    LatencyEvent event(&blobfs_->GetMutableVnodeMetrics()->create, blobfs_->CollectingMetrics());
    assert(memchr(name.data(), '/', name.length()) == nullptr);

    if (!IsDirectory()) {
        return ZX_ERR_NOT_SUPPORTED;
    }

    Digest digest;
    zx_status_t status;
    if ((status = digest.Parse(name.data(), name.length())) != ZX_OK) {
        return status;
    }

    fbl::RefPtr<VnodeBlob> vn = fbl::AdoptRef(new VnodeBlob(blobfs_, std::move(digest)));
    if ((status = Cache().Add(vn)) != ZX_OK) {
        return status;
    }
    vn->fd_count_ = 1;
    *out = std::move(vn);
    return ZX_OK;
}

zx_status_t VnodeBlob::Truncate(size_t len) {
    TRACE_DURATION("blobfs", "VnodeBlob::Truncate", "len", len);
    LatencyEvent event(&blobfs_->GetMutableVnodeMetrics()->truncate,
                       blobfs_->CollectingMetrics());

    if (IsDirectory()) {
        return ZX_ERR_NOT_SUPPORTED;
    }

    return SpaceAllocate(len);
}

#ifdef __Fuchsia__

constexpr const char kFsName[] = "blobfs";

zx_status_t VnodeBlob::QueryFilesystem(fuchsia_io_FilesystemInfo* info) {
    static_assert(fbl::constexpr_strlen(kFsName) + 1 < fuchsia_io_MAX_FS_NAME_BUFFER,
                  "Blobfs name too long");

    memset(info, 0, sizeof(*info));
    info->block_size = kBlobfsBlockSize;
    info->max_filename_size = Digest::kLength * 2;
    info->fs_type = VFS_TYPE_BLOBFS;
    info->fs_id = blobfs_->GetFsId();
    info->total_bytes = blobfs_->Info().data_block_count * blobfs_->Info().block_size;
    info->used_bytes = blobfs_->Info().alloc_block_count * blobfs_->Info().block_size;
    info->total_nodes = blobfs_->Info().inode_count;
    info->used_nodes = blobfs_->Info().alloc_inode_count;
    strlcpy(reinterpret_cast<char*>(info->name), kFsName, fuchsia_io_MAX_FS_NAME_BUFFER);
    return ZX_OK;
}

zx_status_t VnodeBlob::GetDevicePath(size_t buffer_len, char* out_name, size_t* out_len) {
    ssize_t len = ioctl_device_get_topo_path(blobfs_->Fd(), out_name, buffer_len);
    if (len < 0) {
        return static_cast<zx_status_t>(len);
    }
    *out_len = len;
    return ZX_OK;
}

#endif

zx_status_t VnodeBlob::Unlink(fbl::StringPiece name, bool must_be_dir) {
    TRACE_DURATION("blobfs", "VnodeBlob::Unlink", "name", name, "must_be_dir", must_be_dir);
    LatencyEvent event(&blobfs_->GetMutableVnodeMetrics()->unlink, blobfs_->CollectingMetrics());
    assert(memchr(name.data(), '/', name.length()) == nullptr);

    if (!IsDirectory()) {
        return ZX_ERR_NOT_SUPPORTED;
    }

    zx_status_t status;
    Digest digest;
    if ((status = digest.Parse(name.data(), name.length())) != ZX_OK) {
        return status;
    }
    fbl::RefPtr<CacheNode> cache_node;
    if ((status = Cache().Lookup(digest, &cache_node)) != ZX_OK) {
        return status;
    }
    auto vnode = fbl::RefPtr<VnodeBlob>::Downcast(std::move(cache_node));
    blobfs_->LocalMetrics().UpdateLookup(vnode->SizeData());
    return vnode->QueueUnlink();
}

zx_status_t VnodeBlob::GetVmo(int flags, zx_handle_t* out) {
    TRACE_DURATION("blobfs", "VnodeBlob::GetVmo", "flags", flags);

    if (IsDirectory()) {
        return ZX_ERR_NOT_SUPPORTED;
    }
    if (flags & fuchsia_io_VMO_FLAG_WRITE) {
        return ZX_ERR_NOT_SUPPORTED;
    } else if (flags & fuchsia_io_VMO_FLAG_EXACT) {
        return ZX_ERR_NOT_SUPPORTED;
    }

    // Let clients map and set the names of their VMOs.
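    // For example, a request with fuchsia_io_VMO_FLAG_READ | fuchsia_io_VMO_FLAG_EXEC yields a
    // copy-on-write clone whose handle carries ZX_RIGHT_READ and ZX_RIGHT_EXECUTE in addition to
    // the basic/map/property rights below.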
    zx_rights_t rights = ZX_RIGHTS_BASIC | ZX_RIGHT_MAP | ZX_RIGHTS_PROPERTY;

    // We can ignore fuchsia_io_VMO_FLAG_PRIVATE, since private / shared access
    // to the underlying VMO can both be satisfied with a clone due to
    // the immutability of blobfs blobs.
    rights |= (flags & fuchsia_io_VMO_FLAG_READ) ? ZX_RIGHT_READ : 0;
    rights |= (flags & fuchsia_io_VMO_FLAG_EXEC) ? ZX_RIGHT_EXECUTE : 0;
    return CloneVmo(rights, out);
}

void VnodeBlob::Sync(SyncCallback closure) {
    LatencyEvent event(&blobfs_->GetMutableVnodeMetrics()->sync, blobfs_->CollectingMetrics());
    if (atomic_load(&syncing_)) {
        blobfs_->Sync([this, evt = std::move(event),
                       cb = std::move(closure)](zx_status_t status) {
            if (status != ZX_OK) {
                cb(status);
                return;
            }

            fs::WriteTxn sync_txn(blobfs_);
            sync_txn.EnqueueFlush();
            status = sync_txn.Transact();
            cb(status);
        });
    } else {
        closure(ZX_OK);
    }
}

void VnodeBlob::CompleteSync() {
    fsync(blobfs_->Fd());
    atomic_store(&syncing_, false);
}

fbl::RefPtr<VnodeBlob> VnodeBlob::CloneWatcherTeardown() {
    if (clone_watcher_.is_pending()) {
        clone_watcher_.Cancel();
        clone_watcher_.set_object(ZX_HANDLE_INVALID);
        return std::move(clone_ref_);
    }
    return nullptr;
}

zx_status_t VnodeBlob::Open(uint32_t flags, fbl::RefPtr<fs::Vnode>* out_redirect) {
    fd_count_++;
    return ZX_OK;
}

zx_status_t VnodeBlob::Close() {
    LatencyEvent event(&blobfs_->GetMutableVnodeMetrics()->close, blobfs_->CollectingMetrics());
    ZX_DEBUG_ASSERT_MSG(fd_count_ > 0, "Closing blob with no fds open");
    fd_count_--;
    // Attempt purge in case blob was unlinked prior to close.
    return TryPurge();
}

zx_status_t VnodeBlob::TryPurge() {
    if (Purgeable()) {
        return Purge();
    }
    return ZX_OK;
}

zx_status_t VnodeBlob::Purge() {
    ZX_DEBUG_ASSERT(fd_count_ == 0);
    ZX_DEBUG_ASSERT(Purgeable());

    if (GetState() == kBlobStateReadable) {
        // A readable blob should only be purged if it has been unlinked.
        ZX_ASSERT(DeletionQueued());
        fbl::unique_ptr<WritebackWork> wb;
        zx_status_t status = blobfs_->CreateWork(&wb, this);
        if (status != ZX_OK) {
            return status;
        }

        blobfs_->FreeInode(wb.get(), GetMapIndex());
        status = blobfs_->EnqueueWork(std::move(wb), EnqueueType::kJournal);
        if (status != ZX_OK) {
            return status;
        }
    }
    ZX_ASSERT(Cache().Evict(fbl::WrapRefPtr(this)) == ZX_OK);
    SetState(kBlobStatePurged);
    return ZX_OK;
}

} // namespace blobfs
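
// A minimal usage sketch of the write path implemented above (illustrative only; it assumes
// POSIX-style wrappers over the blobfs VFS, a |blobfs_root_fd|, and a |merkle_root| name string
// computed by the caller, none of which are defined in this file):
//
//   int fd = openat(blobfs_root_fd, merkle_root, O_CREAT | O_RDWR);  // VnodeBlob::Create()
//   ftruncate(fd, blob_size);                // Truncate() -> SpaceAllocate() reserves space
//   write(fd, data, blob_size);              // WriteInternal(); writeback + metadata once all
//                                            // bytes have arrived and the digest checks out
//   close(fd);                               // Close(); blob stays readable under |merkle_root|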