// Copyright 2016 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "block.h"

#include <ddk/debug.h>
#include <fbl/algorithm.h>
#include <fbl/auto_call.h>
#include <fbl/auto_lock.h>
#include <inttypes.h>
#include <pretty/hexdump.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
#include <zircon/compiler.h>

#include <utility>

#include "trace.h"

#define LOCAL_TRACE 0

// 1MB max transfer (unless further restricted by ring size).
#define MAX_SCATTER 257
#define MAX_MAX_XFER ((MAX_SCATTER - 1) * PAGE_SIZE)
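// MAX_SCATTER covers the worst case for a 1MB transfer: 256 pages of payload plus one extra
// page when the VMO offset is not page aligned.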

#define PAGE_MASK (PAGE_SIZE - 1)

namespace virtio {

void BlockDevice::txn_complete(block_txn_t* txn, zx_status_t status) {
  if (txn->pmt != ZX_HANDLE_INVALID) {
    zx_pmt_unpin(txn->pmt);
    txn->pmt = ZX_HANDLE_INVALID;
  }
  txn->completion_cb(txn->cookie, status, &txn->op);
}

// DDK level ops

// Optional: return the size (in bytes) of the readable/writable space of the device. Will default
// to 0 (non-seekable) if this is unimplemented.
zx_off_t BlockDevice::virtio_block_get_size(void* ctx) {
  LTRACEF("ctx %p\n", ctx);

  BlockDevice* bd = static_cast<BlockDevice*>(ctx);

  return bd->GetSize();
}

void BlockDevice::GetInfo(block_info_t* info) {
  memset(info, 0, sizeof(*info));
  info->block_size = GetBlockSize();
  info->block_count = GetSize() / GetBlockSize();
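  // Each request's descriptor chain uses one descriptor for the request header and one for the
  // status byte, leaving ring_size - 2 descriptors for data pages.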
  info->max_transfer_size = (uint32_t)(PAGE_SIZE * (ring_size - 2));

  // Limit max transfer to our worst case scatter list size.
  if (info->max_transfer_size > MAX_MAX_XFER) {
    info->max_transfer_size = MAX_MAX_XFER;
  }
}

void BlockDevice::virtio_block_query(void* ctx, block_info_t* info, size_t* bopsz) {
  BlockDevice* bd = static_cast<BlockDevice*>(ctx);
  bd->GetInfo(info);
  *bopsz = sizeof(block_txn_t);
}

void BlockDevice::virtio_block_queue(void* ctx, block_op_t* bop,
                                     block_impl_queue_callback completion_cb, void* cookie) {
  BlockDevice* bd = static_cast<BlockDevice*>(ctx);
  block_txn_t* txn = static_cast<block_txn_t*>((void*)bop);
  txn->pmt = ZX_HANDLE_INVALID;
  txn->completion_cb = completion_cb;
  txn->cookie = cookie;
  bd->SignalWorker(txn);
}

zx_status_t BlockDevice::virtio_block_ioctl(void* ctx, uint32_t op, const void* in_buf,
                                            size_t in_len, void* reply, size_t max,
                                            size_t* out_actual) {
  LTRACEF("ctx %p, op %u\n", ctx, op);

  BlockDevice* bd = static_cast<BlockDevice*>(ctx);

  switch (op) {
    case IOCTL_BLOCK_GET_INFO: {
      block_info_t* info = reinterpret_cast<block_info_t*>(reply);
      if (max < sizeof(*info))
        return ZX_ERR_BUFFER_TOO_SMALL;
      bd->GetInfo(info);
      *out_actual = sizeof(*info);
      return ZX_OK;
    }
    default:
      return ZX_ERR_NOT_SUPPORTED;
  }
}

void BlockDevice::virtio_block_unbind(void* ctx) {
  BlockDevice* bd = static_cast<BlockDevice*>(ctx);
  bd->Unbind();
}

void BlockDevice::virtio_block_release(void* ctx) {
  fbl::unique_ptr<BlockDevice> bd(static_cast<BlockDevice*>(ctx));
  bd->Release();
}

BlockDevice::BlockDevice(zx_device_t* bus_device, zx::bti bti, fbl::unique_ptr<Backend> backend)
    : Device(bus_device, std::move(bti), std::move(backend)) {
  sync_completion_reset(&txn_signal_);
  sync_completion_reset(&worker_signal_);

  memset(&blk_req_buf_, 0, sizeof(blk_req_buf_));
}

zx_status_t BlockDevice::Init() {
  LTRACE_ENTRY;

  DeviceReset();
  CopyDeviceConfig(&config_, sizeof(config_));

  // TODO(cja): The blk_size provided in the device configuration is only
  // populated if a specific feature bit has been negotiated during
  // initialization, otherwise it is 0, at least in Virtio 0.9.5. Use 512
  // as a default as a stopgap for now until proper feature negotiation
  // is supported.
  if (config_.blk_size == 0)
    config_.blk_size = 512;

  LTRACEF("capacity %#" PRIx64 "\n", config_.capacity);
  LTRACEF("size_max %#x\n", config_.size_max);
  LTRACEF("seg_max %#x\n", config_.seg_max);
  LTRACEF("blk_size %#x\n", config_.blk_size);

  DriverStatusAck();

  // TODO: Check feature bits and ack/nak them.

  // Allocate the main vring.
  auto err = vring_.Init(0, ring_size);
  if (err < 0) {
    zxlogf(ERROR, "failed to allocate vring\n");
    return err;
  }

  // Allocate a queue of block requests.
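  // The buffer holds blk_req_count request headers followed by blk_req_count one-byte status
  // responses.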
  size_t size = sizeof(virtio_blk_req_t) * blk_req_count + sizeof(uint8_t) * blk_req_count;

  zx_status_t status =
      io_buffer_init(&blk_req_buf_, bti_.get(), size, IO_BUFFER_RW | IO_BUFFER_CONTIG);
  if (status != ZX_OK) {
    zxlogf(ERROR, "cannot alloc blk_req buffers %d\n", status);
    return status;
  }
  auto cleanup = fbl::MakeAutoCall([this]() { io_buffer_release(&blk_req_buf_); });
  blk_req_ = static_cast<virtio_blk_req_t*>(io_buffer_virt(&blk_req_buf_));

  LTRACEF("allocated blk request at %p, physical address %#" PRIxPTR "\n", blk_req_,
          io_buffer_phys(&blk_req_buf_));

  // Responses are single status bytes following the request headers at the end of the
  // allocated buffer.
  blk_res_pa_ = io_buffer_phys(&blk_req_buf_) + sizeof(virtio_blk_req_t) * blk_req_count;
  blk_res_ = (uint8_t*)((uintptr_t)blk_req_ + sizeof(virtio_blk_req_t) * blk_req_count);

  LTRACEF("allocated blk responses at %p, physical address %#" PRIxPTR "\n", blk_res_,
          blk_res_pa_);

  StartIrqThread();
  DriverStatusOk();

  auto thread_entry = [](void* ctx) {
    auto bd = static_cast<BlockDevice*>(ctx);
    bd->WorkerThread();
    return ZX_OK;
  };
  int ret = thrd_create_with_name(&worker_thread_, thread_entry, this, "virtio-block-worker");
  if (ret != thrd_success) {
    return ZX_ERR_INTERNAL;
  }

  // Initialize and publish the zx_device.
  device_ops_.get_size = &virtio_block_get_size;
  device_ops_.ioctl = &virtio_block_ioctl;
  device_ops_.unbind = &virtio_block_unbind;
  device_ops_.release = &virtio_block_release;

  block_ops_.query = &virtio_block_query;
  block_ops_.queue = &virtio_block_queue;

  device_add_args_t args = {};
  args.version = DEVICE_ADD_ARGS_VERSION;
  args.name = "virtio-block";
  args.ctx = this;
  args.ops = &device_ops_;
  args.proto_id = ZX_PROTOCOL_BLOCK_IMPL;
  args.proto_ops = &block_ops_;

  status = device_add(bus_device_, &args, &device_);
  if (status != ZX_OK) {
    device_ = nullptr;
    return status;
  }

  cleanup.cancel();
  return ZX_OK;
}

void BlockDevice::Release() {
  thrd_join(worker_thread_, nullptr);
  io_buffer_release(&blk_req_buf_);
  Device::Release();
}

void BlockDevice::Unbind() {
  worker_shutdown_.store(true);
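  // Wake the worker thread and any txn waits so they observe the shutdown flag and exit.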
  sync_completion_signal(&worker_signal_);
  sync_completion_signal(&txn_signal_);
  Device::Unbind();
}

void BlockDevice::IrqRingUpdate() {
  LTRACE_ENTRY;

  // Parse our descriptor chain and add back to the free queue.
  auto free_chain = [this](vring_used_elem* used_elem) {
    uint32_t i = (uint16_t)used_elem->id;
    struct vring_desc* desc = vring_.DescFromIndex((uint16_t)i);
    auto head_desc = desc;  // Save the first element.
    {
      fbl::AutoLock lock(&ring_lock_);
      for (;;) {
        int next;
        LTRACE_DO(virtio_dump_desc(desc));
        if (desc->flags & VRING_DESC_F_NEXT) {
          next = desc->next;
        } else {
          // End of chain.
          next = -1;
        }

        vring_.FreeDesc((uint16_t)i);

        if (next < 0)
          break;
        i = next;
        desc = vring_.DescFromIndex((uint16_t)i);
      }
    }

    bool need_complete = false;
    block_txn_t* txn = nullptr;
    {
      fbl::AutoLock lock(&txn_lock_);

      // Search our pending txn list to see if this completes it.
      list_for_every_entry(&pending_txn_list_, txn, block_txn_t, node) {
        if (txn->desc == head_desc) {
          LTRACEF("completes txn %p\n", txn);
          free_blk_req(txn->index);
          list_delete(&txn->node);

          // We will do this outside of the lock.
          need_complete = true;
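          // Wake anyone blocked in FlushPendingTxns() or the QueueTxn() retry loop now that
          // this request slot and its descriptors have been freed.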
          sync_completion_signal(&txn_signal_);
          break;
        }
      }
    }

    if (need_complete) {
      txn_complete(txn, ZX_OK);
    }
  };

  // Tell the ring to find free chains and hand it back to our lambda.
  vring_.IrqRingUpdate(free_chain);
}

void BlockDevice::IrqConfigChange() {
  LTRACE_ENTRY;
}

zx_status_t BlockDevice::QueueTxn(block_txn_t* txn, uint32_t type, size_t bytes, zx_paddr_t* pages,
                                  size_t pagecount, uint16_t* idx) {
  size_t index;
  {
    fbl::AutoLock lock(&txn_lock_);
    index = alloc_blk_req();
    if (index >= blk_req_count) {
      LTRACEF("too many block requests queued (%zu)!\n", index);
      return ZX_ERR_NO_RESOURCES;
    }
  }

  auto req = &blk_req_[index];
  req->type = type;
  req->ioprio = 0;
  if (type == VIRTIO_BLK_T_FLUSH) {
    req->sector = 0;
  } else {
    req->sector = txn->op.rw.offset_dev;
  }
  LTRACEF("blk_req type %u ioprio %u sector %" PRIu64 "\n", req->type, req->ioprio, req->sector);

  // Save the request index so we can free it when we complete the transfer.
  txn->index = index;

  LTRACEF("page count %lu\n", pagecount);

  // Put together a transfer.
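  // The chain is laid out as [request header][data page 0..pagecount-1][one-byte status].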
  uint16_t i;
  vring_desc* desc;
  {
    fbl::AutoLock lock(&ring_lock_);
    desc = vring_.AllocDescChain((uint16_t)(2u + pagecount), &i);
  }
  if (!desc) {
    LTRACEF("failed to allocate descriptor chain of length %zu\n", 2u + pagecount);
    fbl::AutoLock lock(&txn_lock_);
    free_blk_req(index);
    return ZX_ERR_NO_RESOURCES;
  }

  LTRACEF("after alloc chain desc %p, i %u\n", desc, i);

  // Point the txn at this head descriptor.
  txn->desc = desc;

  // Set up the descriptor pointing to the head.
  desc->addr = io_buffer_phys(&blk_req_buf_) + index * sizeof(virtio_blk_req_t);
  desc->len = sizeof(virtio_blk_req_t);
  desc->flags = VRING_DESC_F_NEXT;
  LTRACE_DO(virtio_dump_desc(desc));

  for (size_t n = 0; n < pagecount; n++) {
    desc = vring_.DescFromIndex(desc->next);
    desc->addr = pages[n];
    desc->len = (uint32_t)((bytes > PAGE_SIZE) ? PAGE_SIZE : bytes);
    if (n == 0) {
      // First entry may not be page aligned.
      size_t page0_offset = txn->op.rw.offset_vmo & PAGE_MASK;

      // Adjust starting address.
      desc->addr += page0_offset;

      // Trim length if necessary.
      size_t max = PAGE_SIZE - page0_offset;
      if (desc->len > max) {
        desc->len = (uint32_t)max;
      }
    }
    desc->flags = VRING_DESC_F_NEXT;
    LTRACEF("pa %#lx, len %#x\n", desc->addr, desc->len);
    // Mark buffer as write-only if it's a block read.
    if (type == VIRTIO_BLK_T_IN) {
      desc->flags |= VRING_DESC_F_WRITE;
    }

    bytes -= desc->len;
  }
  LTRACE_DO(virtio_dump_desc(desc));
  assert(bytes == 0);

  // Set up the descriptor pointing to the response.
  desc = vring_.DescFromIndex(desc->next);
  desc->addr = blk_res_pa_ + index;
  desc->len = 1;
  desc->flags = VRING_DESC_F_WRITE;
  LTRACE_DO(virtio_dump_desc(desc));

  *idx = i;
  return ZX_OK;
}

static zx_status_t pin_pages(zx_handle_t bti, block_txn_t* txn, size_t bytes, zx_paddr_t* pages,
                             size_t* num_pages) {
  uint64_t suboffset = txn->op.rw.offset_vmo & PAGE_MASK;
  uint64_t aligned_offset = txn->op.rw.offset_vmo & ~PAGE_MASK;
  size_t pin_size = ROUNDUP(suboffset + bytes, PAGE_SIZE);
  *num_pages = pin_size / PAGE_SIZE;
  if (*num_pages > MAX_SCATTER) {
    TRACEF("virtio: transaction too large\n");
    return ZX_ERR_INVALID_ARGS;
  }

  zx_handle_t vmo = txn->op.rw.vmo;
  zx_status_t status;
  if ((status = zx_bti_pin(bti, ZX_BTI_PERM_READ | ZX_BTI_PERM_WRITE, vmo, aligned_offset,
                           pin_size, pages, *num_pages, &txn->pmt)) != ZX_OK) {
    TRACEF("virtio: could not pin pages %d\n", status);
    return ZX_ERR_INTERNAL;
  }

  // pages[] holds page-aligned addresses; QueueTxn() adds the sub-page offset of the first
  // page and trims that descriptor's length, so the offset is not applied here.
  return ZX_OK;
}

void BlockDevice::SignalWorker(block_txn_t* txn) {
  switch (txn->op.command & BLOCK_OP_MASK) {
    case BLOCK_OP_READ:
    case BLOCK_OP_WRITE:
      // Transaction must fit within device.
      if ((txn->op.rw.offset_dev >= config_.capacity) ||
          (config_.capacity - txn->op.rw.offset_dev < txn->op.rw.length)) {
        LTRACEF("request beyond the end of the device!\n");
        txn_complete(txn, ZX_ERR_OUT_OF_RANGE);
        return;
      }

      if (txn->op.rw.length == 0) {
        txn_complete(txn, ZX_OK);
        return;
      }
      LTRACEF("txn %p, command %#x\n", txn, txn->op.command);
      break;
    case BLOCK_OP_FLUSH:
      LTRACEF("txn %p, command FLUSH\n", txn);
      break;
    default:
      txn_complete(txn, ZX_ERR_NOT_SUPPORTED);
      return;
  }

  fbl::AutoLock lock(&lock_);
  if (worker_shutdown_.load()) {
    txn_complete(txn, ZX_ERR_IO_NOT_PRESENT);
    return;
  }
  list_add_tail(&worker_txn_list_, &txn->node);
  sync_completion_signal(&worker_signal_);
}
void BlockDevice::WorkerThread() {
  auto cleanup = fbl::MakeAutoCall([this]() { CleanupPendingTxns(); });
  block_txn_t* txn = nullptr;
  for (;;) {
    if (worker_shutdown_.load()) {
      return;
    }

    // Pull a txn off the list or wait to be signaled.
    {
      fbl::AutoLock lock(&lock_);
      txn = list_remove_head_type(&worker_txn_list_, block_txn_t, node);
    }
    if (!txn) {
      sync_completion_wait(&worker_signal_, ZX_TIME_INFINITE);
      sync_completion_reset(&worker_signal_);
      continue;
    }

    LTRACEF("WorkerThread handling txn %p\n", txn);

    uint32_t type;
    bool do_flush = false;
    size_t bytes;
    zx_paddr_t pages[MAX_SCATTER];
    size_t num_pages;
    zx_status_t status = ZX_OK;

    if ((txn->op.command & BLOCK_OP_MASK) == BLOCK_OP_FLUSH) {
      type = VIRTIO_BLK_T_FLUSH;
      bytes = 0;
      num_pages = 0;
      do_flush = true;
    } else {
      if ((txn->op.command & BLOCK_OP_MASK) == BLOCK_OP_WRITE) {
        type = VIRTIO_BLK_T_OUT;
      } else {
        type = VIRTIO_BLK_T_IN;
      }
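      // Convert the transfer offset and length from blocks to bytes before pinning pages and
      // building descriptors.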
      txn->op.rw.offset_vmo *= config_.blk_size;
      bytes = txn->op.rw.length * config_.blk_size;
      status = pin_pages(bti_.get(), txn, bytes, pages, &num_pages);
    }

    if (status != ZX_OK) {
      txn_complete(txn, status);
      continue;
    }

    // A flush operation should complete after any inflight transactions, so wait for all
    // pending txns to complete before submitting a flush txn. This is necessary because
    // a virtio block device may service requests in any order.
    if (do_flush) {
      FlushPendingTxns();
      if (worker_shutdown_.load()) {
        return;
      }
    }

    bool cannot_fail = false;
    for (;;) {
      uint16_t idx;
      status = QueueTxn(txn, type, bytes, pages, num_pages, &idx);
      if (status == ZX_OK) {
        fbl::AutoLock lock(&txn_lock_);
        list_add_tail(&pending_txn_list_, &txn->node);
        vring_.SubmitChain(idx);
        vring_.Kick();
        LTRACEF("WorkerThread submitted txn %p\n", txn);
        break;
      }

      if (cannot_fail) {
        TRACEF("virtio-block: failed to queue txn to hw: %d\n", status);
        {
          fbl::AutoLock lock(&txn_lock_);
          free_blk_req(txn->index);
        }
        txn_complete(txn, status);
        break;
      }

      {
        fbl::AutoLock lock(&txn_lock_);
        if (list_is_empty(&pending_txn_list_)) {
          // We hold the txn lock and the list is empty; if we fail this time around,
          // there's no point in trying again.
          cannot_fail = true;
          continue;
        }

        // Reset the txn signal, then wait for one of the pending txns to complete
        // outside the lock. This should mean that resources have been freed for the next
        // iteration. We cannot deadlock due to the reset because pending_txn_list_ is not
        // empty.
        sync_completion_reset(&txn_signal_);
      }

      sync_completion_wait(&txn_signal_, ZX_TIME_INFINITE);
      if (worker_shutdown_.load()) {
        return;
      }
    }

    // A flush operation should complete before any subsequent transactions. So, we wait for all
    // pending transactions (including the flush) to complete before continuing.
    if (do_flush) {
      FlushPendingTxns();
    }
  }
}

void BlockDevice::FlushPendingTxns() {
  for (;;) {
    {
      fbl::AutoLock lock(&txn_lock_);
      if (list_is_empty(&pending_txn_list_)) {
        return;
      }
      sync_completion_reset(&txn_signal_);
    }
    sync_completion_wait(&txn_signal_, ZX_TIME_INFINITE);
    if (worker_shutdown_.load()) {
      return;
    }
  }
}

void BlockDevice::CleanupPendingTxns() {
  // Virtio specification 3.3.1 Driver Requirements: Device Cleanup
  // A driver MUST ensure a virtqueue isn’t live (by device reset) before removing exposed
  // buffers.
  DeviceReset();
  block_txn_t* txn = nullptr;
  block_txn_t* temp_entry = nullptr;
  {
    fbl::AutoLock lock(&lock_);
    list_for_every_entry_safe(&worker_txn_list_, txn, temp_entry, block_txn_t, node) {
      list_delete(&txn->node);
      txn_complete(txn, ZX_ERR_IO_NOT_PRESENT);
    }
  }
  fbl::AutoLock lock(&txn_lock_);
  list_for_every_entry_safe(&pending_txn_list_, txn, temp_entry, block_txn_t, node) {
    free_blk_req(txn->index);
    list_delete(&txn->node);
    txn_complete(txn, ZX_ERR_IO_NOT_PRESENT);
  }
}

}  // namespace virtio