// Copyright 2016 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "block.h"

#include <ddk/debug.h>
#include <fbl/algorithm.h>
#include <fbl/auto_call.h>
#include <fbl/auto_lock.h>
#include <inttypes.h>
#include <pretty/hexdump.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
#include <zircon/compiler.h>

#include <utility>

#include "trace.h"

#define LOCAL_TRACE 0

// 1MB max transfer (unless further restricted by ring size).
#define MAX_SCATTER 257
#define MAX_MAX_XFER ((MAX_SCATTER - 1) * PAGE_SIZE)

#define PAGE_MASK (PAGE_SIZE - 1)

namespace virtio {

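// Complete a transaction: unpin any pages pinned for the transfer and hand the
// result back to the layer above via its completion callback.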
void BlockDevice::txn_complete(block_txn_t* txn, zx_status_t status) {
    if (txn->pmt != ZX_HANDLE_INVALID) {
        zx_pmt_unpin(txn->pmt);
        txn->pmt = ZX_HANDLE_INVALID;
    }
    txn->completion_cb(txn->cookie, status, &txn->op);
}

// DDK level ops

// Optional: return the size (in bytes) of the readable/writable space of the device. Will default
// to 0 (non-seekable) if this is unimplemented.
zx_off_t BlockDevice::virtio_block_get_size(void* ctx) {
    LTRACEF("ctx %p\n", ctx);

    BlockDevice* bd = static_cast<BlockDevice*>(ctx);

    return bd->GetSize();
}

void BlockDevice::GetInfo(block_info_t* info) {
    memset(info, 0, sizeof(*info));
    info->block_size = GetBlockSize();
    info->block_count = GetSize() / GetBlockSize();
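    // Each request also consumes one descriptor for the header and one for the
    // status byte (see QueueTxn), leaving ring_size - 2 descriptors for data pages.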
    info->max_transfer_size = (uint32_t)(PAGE_SIZE * (ring_size - 2));

    // Limit max transfer to our worst case scatter list size.
    if (info->max_transfer_size > MAX_MAX_XFER) {
        info->max_transfer_size = MAX_MAX_XFER;
    }
}

void BlockDevice::virtio_block_query(void* ctx, block_info_t* info, size_t* bopsz) {
    BlockDevice* bd = static_cast<BlockDevice*>(ctx);
    bd->GetInfo(info);
    *bopsz = sizeof(block_txn_t);
}

void BlockDevice::virtio_block_queue(void* ctx, block_op_t* bop,
                                     block_impl_queue_callback completion_cb, void* cookie) {
    BlockDevice* bd = static_cast<BlockDevice*>(ctx);
    block_txn_t* txn = static_cast<block_txn_t*>((void*)bop);
    txn->pmt = ZX_HANDLE_INVALID;
    txn->completion_cb = completion_cb;
    txn->cookie = cookie;
    bd->SignalWorker(txn);
}

zx_status_t BlockDevice::virtio_block_ioctl(void* ctx, uint32_t op, const void* in_buf,
                                            size_t in_len, void* reply, size_t max,
                                            size_t* out_actual) {
    LTRACEF("ctx %p, op %u\n", ctx, op);

    BlockDevice* bd = static_cast<BlockDevice*>(ctx);

    switch (op) {
    case IOCTL_BLOCK_GET_INFO: {
        block_info_t* info = reinterpret_cast<block_info_t*>(reply);
        if (max < sizeof(*info))
            return ZX_ERR_BUFFER_TOO_SMALL;
        bd->GetInfo(info);
        *out_actual = sizeof(*info);
        return ZX_OK;
    }
    default:
        return ZX_ERR_NOT_SUPPORTED;
    }
}

void BlockDevice::virtio_block_unbind(void* ctx) {
    BlockDevice* bd = static_cast<BlockDevice*>(ctx);
    bd->Unbind();
}

void BlockDevice::virtio_block_release(void* ctx) {
    fbl::unique_ptr<BlockDevice> bd(static_cast<BlockDevice*>(ctx));
    bd->Release();
}

BlockDevice::BlockDevice(zx_device_t* bus_device, zx::bti bti, fbl::unique_ptr<Backend> backend)
    : Device(bus_device, std::move(bti), std::move(backend)) {
    sync_completion_reset(&txn_signal_);
    sync_completion_reset(&worker_signal_);

    memset(&blk_req_buf_, 0, sizeof(blk_req_buf_));
}

zx_status_t BlockDevice::Init() {
    LTRACE_ENTRY;

    DeviceReset();
    CopyDeviceConfig(&config_, sizeof(config_));

    // TODO(cja): The blk_size provided in the device configuration is only
    // populated if a specific feature bit has been negotiated during
    // initialization, otherwise it is 0, at least in Virtio 0.9.5. Use 512
    // as a default as a stopgap for now until proper feature negotiation
    // is supported.
    if (config_.blk_size == 0)
        config_.blk_size = 512;

    LTRACEF("capacity %#" PRIx64 "\n", config_.capacity);
    LTRACEF("size_max %#x\n", config_.size_max);
    LTRACEF("seg_max  %#x\n", config_.seg_max);
    LTRACEF("blk_size %#x\n", config_.blk_size);

    DriverStatusAck();

    // TODO: Check feature bits and ack/nak them.

    // Allocate the main vring.
    auto err = vring_.Init(0, ring_size);
    if (err < 0) {
        zxlogf(ERROR, "failed to allocate vring\n");
        return err;
    }

    // Allocate a buffer of block request headers, followed by one status byte
    // per request.
    size_t size = sizeof(virtio_blk_req_t) * blk_req_count + sizeof(uint8_t) * blk_req_count;

    zx_status_t status =
        io_buffer_init(&blk_req_buf_, bti_.get(), size, IO_BUFFER_RW | IO_BUFFER_CONTIG);
    if (status != ZX_OK) {
        zxlogf(ERROR, "cannot alloc blk_req buffers %d\n", status);
        return status;
    }
    auto cleanup = fbl::MakeAutoCall([this]() { io_buffer_release(&blk_req_buf_); });
    blk_req_ = static_cast<virtio_blk_req_t*>(io_buffer_virt(&blk_req_buf_));

    LTRACEF("allocated blk request at %p, physical address %#" PRIxPTR "\n", blk_req_,
            io_buffer_phys(&blk_req_buf_));

    // Responses are one status byte per request, at the end of the allocated block.
    blk_res_pa_ = io_buffer_phys(&blk_req_buf_) + sizeof(virtio_blk_req_t) * blk_req_count;
    blk_res_ = (uint8_t*)((uintptr_t)blk_req_ + sizeof(virtio_blk_req_t) * blk_req_count);

    LTRACEF("allocated blk responses at %p, physical address %#" PRIxPTR "\n", blk_res_,
            blk_res_pa_);

    StartIrqThread();
    DriverStatusOk();

    auto thread_entry = [](void* ctx) {
        auto bd = static_cast<BlockDevice*>(ctx);
        bd->WorkerThread();
        return ZX_OK;
    };
    int ret = thrd_create_with_name(&worker_thread_, thread_entry, this, "virtio-block-worker");
    if (ret != thrd_success) {
        return ZX_ERR_INTERNAL;
    }

    // Initialize and publish the zx_device.
    device_ops_.get_size = &virtio_block_get_size;
    device_ops_.ioctl = &virtio_block_ioctl;
    device_ops_.unbind = &virtio_block_unbind;
    device_ops_.release = &virtio_block_release;

    block_ops_.query = &virtio_block_query;
    block_ops_.queue = &virtio_block_queue;

    device_add_args_t args = {};
    args.version = DEVICE_ADD_ARGS_VERSION;
    args.name = "virtio-block";
    args.ctx = this;
    args.ops = &device_ops_;
    args.proto_id = ZX_PROTOCOL_BLOCK_IMPL;
    args.proto_ops = &block_ops_;

    status = device_add(bus_device_, &args, &device_);
    if (status != ZX_OK) {
        device_ = nullptr;
        return status;
    }

    cleanup.cancel();
    return ZX_OK;
}

void BlockDevice::Release() {
    thrd_join(worker_thread_, nullptr);
    io_buffer_release(&blk_req_buf_);
    Device::Release();
}

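// Signal shutdown on both completions so the worker wakes whether it is
// blocked waiting for new work or for pending txns to complete.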
void BlockDevice::Unbind() {
    worker_shutdown_.store(true);
    sync_completion_signal(&worker_signal_);
    sync_completion_signal(&txn_signal_);
    Device::Unbind();
}

void BlockDevice::IrqRingUpdate() {
    LTRACE_ENTRY;

    // Parse our descriptor chain and add it back to the free queue.
    auto free_chain = [this](vring_used_elem* used_elem) {
        uint32_t i = (uint16_t)used_elem->id;
        struct vring_desc* desc = vring_.DescFromIndex((uint16_t)i);
        auto head_desc = desc; // Save the first element.
        {
            fbl::AutoLock lock(&ring_lock_);
            for (;;) {
                int next;
                LTRACE_DO(virtio_dump_desc(desc));
                if (desc->flags & VRING_DESC_F_NEXT) {
                    next = desc->next;
                } else {
                    // End of chain.
                    next = -1;
                }

                vring_.FreeDesc((uint16_t)i);

                if (next < 0)
                    break;
                i = next;
                desc = vring_.DescFromIndex((uint16_t)i);
            }
        }

        bool need_complete = false;
        block_txn_t* txn = nullptr;
        {
            fbl::AutoLock lock(&txn_lock_);

            // Search our pending txn list to see if this completes it.
            list_for_every_entry(&pending_txn_list_, txn, block_txn_t, node) {
                if (txn->desc == head_desc) {
                    LTRACEF("completes txn %p\n", txn);
                    free_blk_req(txn->index);
                    list_delete(&txn->node);

                    // We will do this outside of the lock.
                    need_complete = true;

                    sync_completion_signal(&txn_signal_);
                    break;
                }
            }
        }

        if (need_complete) {
            txn_complete(txn, ZX_OK);
        }
    };

    // Tell the ring to find completed chains and hand each back to our lambda.
    vring_.IrqRingUpdate(free_chain);
}

void BlockDevice::IrqConfigChange() {
    LTRACE_ENTRY;
}

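// Build one virtio request as a descriptor chain: a device-readable header
// descriptor, one descriptor per data page, and a device-writable status byte.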
zx_status_t BlockDevice::QueueTxn(block_txn_t* txn, uint32_t type, size_t bytes, zx_paddr_t* pages,
                                  size_t pagecount, uint16_t* idx) {
    size_t index;
    {
        fbl::AutoLock lock(&txn_lock_);
        index = alloc_blk_req();
        if (index >= blk_req_count) {
            LTRACEF("too many block requests queued (%zu)!\n", index);
            return ZX_ERR_NO_RESOURCES;
        }
    }

    auto req = &blk_req_[index];
    req->type = type;
    req->ioprio = 0;
    if (type == VIRTIO_BLK_T_FLUSH) {
        req->sector = 0;
    } else {
        req->sector = txn->op.rw.offset_dev;
    }
    LTRACEF("blk_req type %u ioprio %u sector %" PRIu64 "\n", req->type, req->ioprio, req->sector);

    // Save the request index so we can free it when we complete the transfer.
    txn->index = index;

    LTRACEF("page count %lu\n", pagecount);

    // Put together a transfer.
    uint16_t i;
    vring_desc* desc;
    {
        fbl::AutoLock lock(&ring_lock_);
        desc = vring_.AllocDescChain((uint16_t)(2u + pagecount), &i);
    }
    if (!desc) {
        LTRACEF("failed to allocate descriptor chain of length %zu\n", 2u + pagecount);
        fbl::AutoLock lock(&txn_lock_);
        free_blk_req(index);
        return ZX_ERR_NO_RESOURCES;
    }

    LTRACEF("after alloc chain desc %p, i %u\n", desc, i);

    // Point the txn at this head descriptor.
    txn->desc = desc;

    // Set up the head descriptor pointing to the request header.
    desc->addr = io_buffer_phys(&blk_req_buf_) + index * sizeof(virtio_blk_req_t);
    desc->len = sizeof(virtio_blk_req_t);
    desc->flags = VRING_DESC_F_NEXT;
    LTRACE_DO(virtio_dump_desc(desc));

    for (size_t n = 0; n < pagecount; n++) {
        desc = vring_.DescFromIndex(desc->next);
        desc->addr = pages[n];
        desc->len = (uint32_t)((bytes > PAGE_SIZE) ? PAGE_SIZE : bytes);
        if (n == 0) {
            // First entry may not be page aligned.
            size_t page0_offset = txn->op.rw.offset_vmo & PAGE_MASK;

            // Adjust starting address.
            desc->addr += page0_offset;

            // Trim length if necessary.
            size_t max = PAGE_SIZE - page0_offset;
            if (desc->len > max) {
                desc->len = (uint32_t)max;
            }
        }
        desc->flags = VRING_DESC_F_NEXT;
        LTRACEF("pa %#lx, len %#x\n", desc->addr, desc->len);

        // Mark the buffer as device-writable if it's a block read.
        if (type == VIRTIO_BLK_T_IN) {
            desc->flags |= VRING_DESC_F_WRITE;
        }

        bytes -= desc->len;
    }
    LTRACE_DO(virtio_dump_desc(desc));
    assert(bytes == 0);

    // Set up the descriptor pointing to the response.
    desc = vring_.DescFromIndex(desc->next);
    desc->addr = blk_res_pa_ + index;
    desc->len = 1;
    desc->flags = VRING_DESC_F_WRITE;
    LTRACE_DO(virtio_dump_desc(desc));

    *idx = i;
    return ZX_OK;
}

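// Pin the pages of the VMO that back the transfer. Pinning is page-aligned, so
// the sub-page offset of the first page is added back into pages[0].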
static zx_status_t pin_pages(zx_handle_t bti, block_txn_t* txn, size_t bytes, zx_paddr_t* pages,
                             size_t* num_pages) {
    uint64_t suboffset = txn->op.rw.offset_vmo & PAGE_MASK;
    uint64_t aligned_offset = txn->op.rw.offset_vmo & ~PAGE_MASK;
    size_t pin_size = ROUNDUP(suboffset + bytes, PAGE_SIZE);
    *num_pages = pin_size / PAGE_SIZE;
    if (*num_pages > MAX_SCATTER) {
        TRACEF("virtio: transaction too large\n");
        return ZX_ERR_INVALID_ARGS;
    }

    zx_handle_t vmo = txn->op.rw.vmo;
    zx_status_t status;
    if ((status = zx_bti_pin(bti, ZX_BTI_PERM_READ | ZX_BTI_PERM_WRITE, vmo, aligned_offset,
                             pin_size, pages, *num_pages, &txn->pmt)) != ZX_OK) {
        TRACEF("virtio: could not pin pages %d\n", status);
        return ZX_ERR_INTERNAL;
    }

    pages[0] += suboffset;
    return ZX_OK;
}

void BlockDevice::SignalWorker(block_txn_t* txn) {
    switch (txn->op.command & BLOCK_OP_MASK) {
    case BLOCK_OP_READ:
    case BLOCK_OP_WRITE:
        // Transaction must fit within the device.
        if ((txn->op.rw.offset_dev >= config_.capacity) ||
            (config_.capacity - txn->op.rw.offset_dev < txn->op.rw.length)) {
            LTRACEF("request beyond the end of the device!\n");
            txn_complete(txn, ZX_ERR_OUT_OF_RANGE);
            return;
        }

        if (txn->op.rw.length == 0) {
            txn_complete(txn, ZX_OK);
            return;
        }
        LTRACEF("txn %p, command %#x\n", txn, txn->op.command);
        break;
    case BLOCK_OP_FLUSH:
        LTRACEF("txn %p, command FLUSH\n", txn);
        break;
    default:
        txn_complete(txn, ZX_ERR_NOT_SUPPORTED);
        return;
    }

    fbl::AutoLock lock(&lock_);
    if (worker_shutdown_.load()) {
        txn_complete(txn, ZX_ERR_IO_NOT_PRESENT);
        return;
    }
    list_add_tail(&worker_txn_list_, &txn->node);
    sync_completion_signal(&worker_signal_);
}

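// Worker loop: pull txns off worker_txn_list_, turn each into a virtio
// request, and retry submission when descriptors or request slots run out,
// waiting on txn_signal_ for in-flight requests to free resources.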
void BlockDevice::WorkerThread() {
    auto cleanup = fbl::MakeAutoCall([this]() { CleanupPendingTxns(); });
    block_txn_t* txn = nullptr;
    for (;;) {
        if (worker_shutdown_.load()) {
            return;
        }

        // Pull a txn off the list or wait to be signaled.
        {
            fbl::AutoLock lock(&lock_);
            txn = list_remove_head_type(&worker_txn_list_, block_txn_t, node);
        }
        if (!txn) {
            sync_completion_wait(&worker_signal_, ZX_TIME_INFINITE);
            sync_completion_reset(&worker_signal_);
            continue;
        }

        LTRACEF("WorkerThread handling txn %p\n", txn);

        uint32_t type;
        bool do_flush = false;
        size_t bytes;
        zx_paddr_t pages[MAX_SCATTER];
        size_t num_pages;
        zx_status_t status = ZX_OK;

        if ((txn->op.command & BLOCK_OP_MASK) == BLOCK_OP_FLUSH) {
            type = VIRTIO_BLK_T_FLUSH;
            bytes = 0;
            num_pages = 0;
            do_flush = true;
        } else {
            if ((txn->op.command & BLOCK_OP_MASK) == BLOCK_OP_WRITE) {
                type = VIRTIO_BLK_T_OUT;
            } else {
                type = VIRTIO_BLK_T_IN;
            }
            // Convert the block-granular VMO offset and length to bytes.
            txn->op.rw.offset_vmo *= config_.blk_size;
            bytes = txn->op.rw.length * config_.blk_size;
            status = pin_pages(bti_.get(), txn, bytes, pages, &num_pages);
        }

        if (status != ZX_OK) {
            txn_complete(txn, status);
            continue;
        }

        // A flush operation should complete after any inflight transactions, so wait for all
        // pending txns to complete before submitting a flush txn. This is necessary because
        // a virtio block device may service requests in any order.
        if (do_flush) {
            FlushPendingTxns();
            if (worker_shutdown_.load()) {
                return;
            }
        }

        bool cannot_fail = false;
        for (;;) {
            uint16_t idx;
            status = QueueTxn(txn, type, bytes, pages, num_pages, &idx);
            if (status == ZX_OK) {
                fbl::AutoLock lock(&txn_lock_);
                list_add_tail(&pending_txn_list_, &txn->node);
                vring_.SubmitChain(idx);
                vring_.Kick();
                LTRACEF("WorkerThread submitted txn %p\n", txn);
                break;
            }

            if (cannot_fail) {
                TRACEF("virtio-block: failed to queue txn to hw: %d\n", status);
                {
                    fbl::AutoLock lock(&txn_lock_);
                    free_blk_req(txn->index);
                }
                txn_complete(txn, status);
                break;
            }

            {
                fbl::AutoLock lock(&txn_lock_);
                if (list_is_empty(&pending_txn_list_)) {
                    // We hold the txn lock and the list is empty; if we fail this time around,
                    // there's no point in trying again.
                    cannot_fail = true;
                    continue;
                }

                // Reset the txn signal, then wait outside the lock for one of the pending txns
                // to complete. This should mean that resources have been freed for the next
                // iteration. We cannot deadlock on the reset because pending_txn_list_ is not
                // empty.
                sync_completion_reset(&txn_signal_);
            }

            sync_completion_wait(&txn_signal_, ZX_TIME_INFINITE);
            if (worker_shutdown_.load()) {
                return;
            }
        }

        // A flush operation should complete before any subsequent transactions. So, we wait for all
        // pending transactions (including the flush) to complete before continuing.
        if (do_flush) {
            FlushPendingTxns();
        }
    }
}

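// Block until pending_txn_list_ drains, or until shutdown is requested.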
void BlockDevice::FlushPendingTxns() {
    for (;;) {
        {
            fbl::AutoLock lock(&txn_lock_);
            if (list_is_empty(&pending_txn_list_)) {
                return;
            }
            sync_completion_reset(&txn_signal_);
        }
        sync_completion_wait(&txn_signal_, ZX_TIME_INFINITE);
        if (worker_shutdown_.load()) {
            return;
        }
    }
}

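// Fail any txns that were queued but never completed; runs on the worker
// thread as it shuts down (see the auto-call at the top of WorkerThread).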
void BlockDevice::CleanupPendingTxns() {
    // Virtio specification 3.3.1 Driver Requirements: Device Cleanup
    // A driver MUST ensure a virtqueue isn't live (by device reset) before removing exposed
    // buffers.
    DeviceReset();
    block_txn_t* txn = nullptr;
    block_txn_t* temp_entry = nullptr;
    {
        fbl::AutoLock lock(&lock_);
        list_for_every_entry_safe(&worker_txn_list_, txn, temp_entry, block_txn_t, node) {
            list_delete(&txn->node);
            txn_complete(txn, ZX_ERR_IO_NOT_PRESENT);
        }
    }
    fbl::AutoLock lock(&txn_lock_);
    list_for_every_entry_safe(&pending_txn_list_, txn, temp_entry, block_txn_t, node) {
        free_blk_req(txn->index);
        list_delete(&txn->node);
        txn_complete(txn, ZX_ERR_IO_NOT_PRESENT);
    }
}

} // namespace virtio