/*-
 * Copyright (c) 2013 Peter Grehan
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <pthread.h>
#include <unistd.h>
#include <linux/fs.h>
#include <liburing.h>

#include "dm.h"
#include "block_if.h"
#include "ahci.h"
#include "dm_string.h"
#include "log.h"
#include "iothread.h"

/*
 * Notes:
 * F_OFD_SETLK support was introduced in glibc 2.20, and the glibc version on
 * the target board is newer than 2.20. The following code temporarily fixes
 * the build on Ubuntu 14.04, where the default glibc version is 2.19.
 * Ideally a cross-compilation toolchain should be used to build applications.
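 * (37 is the value the Linux UAPI fcntl headers assign to F_OFD_SETLK, so
 * the fallback definition below matches the kernel interface.)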
*/ #ifndef F_OFD_SETLK #define F_OFD_SETLK 37 #endif #define BLOCKIF_SIG 0xb109b109 #define BLOCKIF_NUMTHR 8 #define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR) #define MAX_DISCARD_SEGMENT 256 #define AIO_MODE_THREAD_POOL 0 #define AIO_MODE_IO_URING 1 /* the max number of entries for the io_uring submission/completion queue */ #define MAX_IO_URING_ENTRIES 256 /* * Debug printf */ static int block_if_debug; #define DPRINTF(params) do { if (block_if_debug) pr_dbg params; } while (0) #define WPRINTF(params) (pr_err params) enum blockop { BOP_READ, BOP_WRITE, BOP_FLUSH, BOP_DISCARD }; enum blockstat { BST_FREE, BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE }; struct blockif_elem { TAILQ_ENTRY(blockif_elem) link; struct blockif_req *req; enum blockop op; enum blockstat status; pthread_t tid; off_t block; }; struct blockif_queue { int closing; pthread_t btid[BLOCKIF_NUMTHR]; pthread_mutex_t mtx; pthread_cond_t cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) freeq; TAILQ_HEAD(, blockif_elem) pendq; TAILQ_HEAD(, blockif_elem) busyq; struct blockif_elem reqs[BLOCKIF_MAXREQ]; int in_flight; struct io_uring ring; struct iothread_mevent iomvt; struct iothread_ctx *ioctx; struct blockif_ctxt *bc; }; struct blockif_ops { int aio_mode; int (*init)(struct blockif_queue *, char *); void (*deinit)(struct blockif_queue *); void (*mutex_lock)(pthread_mutex_t *); void (*mutex_unlock)(pthread_mutex_t *); void (*request)(struct blockif_queue *); }; struct blockif_ctxt { int fd; int isblk; int candiscard; int rdonly; off_t size; int sub_file_assign; off_t sub_file_start_lba; struct flock fl; int sectsz; int psectsz; int psectoff; int max_discard_sectors; int max_discard_seg; int discard_sector_alignment; struct blockif_queue *bqs; int bq_num; int aio_mode; const struct blockif_ops *ops; /* write cache enable */ uint8_t wce; /* whether bypass the Service VM's page cache or not */ uint8_t bypass_host_cache; /* * whether enable BST_BLOCK logic in blockif_dequeue/blockif_complete or not. * * If the BST_BLOCK logic is enabled, following check would be done: * if the current request is consecutive to any request in penq or busyq, * current request's status is set to BST_BLOCK. Then, this request is blocked until the prior request, * which blocks it, is completed. * It indicates that consecutive requests are executed sequentially. 
*/ uint8_t bst_block; }; static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; struct blockif_sig_elem { pthread_mutex_t mtx; pthread_cond_t cond; int pending; struct blockif_sig_elem *next; }; struct discard_range { uint64_t sector; uint32_t num_sectors; uint32_t flags; }; static struct blockif_sig_elem *blockif_bse_head; static int blockif_flush_cache(struct blockif_ctxt *bc) { int err; err = 0; if (!bc->wce) { if (fsync(bc->fd)) err = errno; } return err; } static int blockif_enqueue(struct blockif_queue *bq, struct blockif_req *breq, enum blockop op) { struct blockif_elem *be, *tbe; off_t off; int i; be = TAILQ_FIRST(&bq->freeq); if (be == NULL || be->status != BST_FREE) { WPRINTF(("%s: failed to get element from freeq\n", __func__)); return 0; } TAILQ_REMOVE(&bq->freeq, be, link); be->req = breq; be->op = op; be->status = BST_PEND; if (bq->bc->bst_block == 1) { switch (op) { case BOP_READ: case BOP_WRITE: case BOP_DISCARD: off = breq->offset; for (i = 0; i < breq->iovcnt; i++) off += breq->iov[i].iov_len; break; default: /* off = OFF_MAX; */ off = 1 << (sizeof(off_t) - 1); } be->block = off; TAILQ_FOREACH(tbe, &bq->pendq, link) { if (tbe->block == breq->offset) break; } if (tbe == NULL) { TAILQ_FOREACH(tbe, &bq->busyq, link) { if (tbe->block == breq->offset) break; } } if (tbe != NULL) be->status = BST_BLOCK; } TAILQ_INSERT_TAIL(&bq->pendq, be, link); return (be->status == BST_PEND); } static int blockif_dequeue(struct blockif_queue *bq, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; TAILQ_FOREACH(be, &bq->pendq, link) { if (be->status == BST_PEND) break; } if (be == NULL) return 0; TAILQ_REMOVE(&bq->pendq, be, link); be->status = BST_BUSY; be->tid = t; TAILQ_INSERT_TAIL(&bq->busyq, be, link); *bep = be; return 1; } static void blockif_complete(struct blockif_queue *bq, struct blockif_elem *be) { struct blockif_elem *tbe; if (be->status == BST_DONE || be->status == BST_BUSY) TAILQ_REMOVE(&bq->busyq, be, link); else TAILQ_REMOVE(&bq->pendq, be, link); if (bq->bc->bst_block == 1) { TAILQ_FOREACH(tbe, &bq->pendq, link) { if (tbe->req->offset == be->block) tbe->status = BST_PEND; } } be->tid = 0; be->status = BST_FREE; be->req = NULL; TAILQ_INSERT_TAIL(&bq->freeq, be, link); } static int discard_range_validate(struct blockif_ctxt *bc, off_t start, off_t size) { off_t start_sector = start / DEV_BSIZE; off_t size_sector = size / DEV_BSIZE; if (!size || (start + size) > (bc->size + bc->sub_file_start_lba)) return -1; if ((size_sector > bc->max_discard_sectors) || (bc->discard_sector_alignment && start_sector % bc->discard_sector_alignment)) return -1; return 0; } static int blockif_process_discard(struct blockif_ctxt *bc, struct blockif_req *br) { int err; struct discard_range *range; int n_range, i, segment; off_t arg[MAX_DISCARD_SEGMENT][2]; err = 0; n_range = 0; segment = 0; if (!bc->candiscard) return EOPNOTSUPP; if (bc->rdonly) return EROFS; if (br->iovcnt == 1) { /* virtio-blk use iov to transfer discard range */ n_range = br->iov[0].iov_len/sizeof(*range); range = br->iov[0].iov_base; for (i = 0; i < n_range; i++) { arg[i][0] = range[i].sector * DEV_BSIZE + bc->sub_file_start_lba; arg[i][1] = range[i].num_sectors * DEV_BSIZE; segment++; if (segment > bc->max_discard_seg) { WPRINTF(("segment > max_discard_seg\n")); return EINVAL; } if (discard_range_validate(bc, arg[i][0], arg[i][1])) { WPRINTF(("range [%ld: %ld] is invalid\n", arg[i][0], arg[i][1])); return EINVAL; } } } else { /* ahci parse discard range to br->offset and br->reside */ arg[0][0] = br->offset + 
bc->sub_file_start_lba; arg[0][1] = br->resid; segment = 1; } for (i = 0; i < segment; i++) { if (bc->isblk) { err = ioctl(bc->fd, BLKDISCARD, arg[i]); } else { /* FALLOC_FL_PUNCH_HOLE: * Deallocates space in the byte range starting at offset and * continuing for length bytes. After a successful call, * subsequent reads from this range will return zeroes. * FALLOC_FL_KEEP_SIZE: * Do not modify the apparent length of the file. */ err = fallocate(bc->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, arg[i][0], arg[i][1]); if (!err) err = fdatasync(bc->fd); } if (err) { WPRINTF(("Failed to discard offset=%ld nbytes=%ld err code: %d\n", arg[i][0], arg[i][1], err)); return err; } } br->resid = 0; return 0; } static void blockif_init_iov_align_info(struct blockif_req *br) { int i, size; struct br_align_info *info = &br->align_info; size = 0; info->is_iov_base_aligned = true; info->is_iov_len_aligned = true; for (i = 0; i < br->iovcnt; i++) { size += br->iov[i].iov_len; if ((uint64_t)(br->iov[i].iov_base) % info->alignment) { info->is_iov_base_aligned = false; } if (br->iov[i].iov_len % info->alignment) { info->is_iov_len_aligned = false; } } info->org_size = size; return; } /* only for debug purpose */ static void blockif_dump_align_info(struct blockif_req *br) { struct br_align_info *info = &br->align_info; int i; if (!info->is_offset_aligned) { DPRINTF(("%s: Misaligned offset 0x%llx \n\r", __func__, (info->aligned_dn_start + info->head))); } /* iov info */ if (!info->is_iov_base_aligned) { DPRINTF(("%s: Misaligned iov_base \n\r", __func__)); } if (!info->is_iov_len_aligned) { DPRINTF(("%s: Misaligned iov_len \n\r", __func__)); } DPRINTF(("%s: alignment %d, br->iovcnt %d \n\r", __func__, info->alignment, br->iovcnt)); for (i = 0; i < br->iovcnt; i++) { DPRINTF(("%s: iov[%d].iov_base 0x%llx (remainder %d), iov[%d].iov_len %d (remainder %d) \n\r", __func__, i, (uint64_t)(br->iov[i].iov_base), (uint64_t)(br->iov[i].iov_base) % info->alignment, i, br->iov[i].iov_len, (br->iov[i].iov_len) % info->alignment)); } /* overall info */ DPRINTF(("%s: head %d, tail %d, org_size %d, bounced_size %d, aligned_dn_start 0x%lx aligned_dn_end 0x%lx \n\r", __func__, info->head, info->tail, info->org_size, info->bounced_size, info->aligned_dn_start, info->aligned_dn_end)); } /* * |<------------------------------------- bounced_size --------------------------------->| * |<-------- alignment ------->| |<-------- alignment ------->| * |<--- head --->|<------------------------ org_size ---------------------->|<-- tail -->| * | | | | | | * *--------------$-------------*----------- ... 
------------*---------------$------------* * | | | | | | * | start end | * aligned_dn_start aligned_dn_end * |__________head_area_________| |__________tail_area_________| * |<--- head --->| | |<-- end_rmd -->|<-- tail -->| * |<-------- alignment ------->| |<-------- alignment ------->| * * * Original access area: * - start = br->offset + bc->sub_file_start_lba * - org_size = SUM of org_iov[i].iov_len * - end = start + org_size * * * Head area to be bounced: * - head = start % alignment * - aligned_dn_start = start - head * head | head_area * -------------|------------- * 0 | not exist * non-zero | exist * * * Tail area to be bounced: * - end_rmd = end % alignment * - aligned_dn_end = end - end_rmd * end_rmd | tail | tail_area * -------------|-----------------------|------------------ * 0 | 0 | not exist * non-zero | alignment - end_rmd | exist * * * Overall bounced area: * - bounced_size = head + org_size + tail * * * Use a single bounce_iov to do the aligned READ/WRITE. * - bounce_iov cnt = 1 * - bounce_iov.iov_base = return of posix_memalign (aligned to @alignment) * - bounce_iov.len = bounced_size * - Accessing from the offset `aligned_dn_start` * * * For READ access: * 1. Do the aligned READ (using `bounce_iov`) from the offset `aligned_dn_start`, with the length `bounced_size`. * 2. AFTER the aligned READ is completed, copy the data from the bounce_iov to the org_iov. * from | length * ------------------------------|--------------- * bounce_iov.iov_base + head | org_size * * * For WRITE access: * 1. BEFORE the aligned WRITE is conducted, construct the bounced data with three parts in bounce_iov. * (a). If head is not 0, get data of first alignment area -> head_area data (by doing aligned read) * from | length * --------------------|--------------- * aligned_dn_start | alignment * * (b). If tail is not 0, get data of last alignment area -> tail_area data (by doing aligned read) * from | length * --------------------|--------------- * aligned_dn_end | alignment * * (c). Construct the bounced data in bounce_iov * from | to | length | source * --------------------|------------------|---------------|--------------------------------- * aligned_dn_start | start | head | head_area data from block device * start | end | org_size | data specified in org_iov[] * end | end + tail | tail | tail_area data from block device * 2. Do the aligned WRITE (using `bounce_iov`) from the offset `aligned_dn_start`, with the length `bounced_size`. * * */ static void blockif_init_alignment_info(struct blockif_ctxt *bc, struct blockif_req *br) { struct br_align_info *info = &br->align_info; uint32_t alignment = bc->sectsz; uint32_t end_rmd; off_t start, end; bool all_aligned; /* If O_DIRECT flag is not used, does NOT need to initialize the alignment info. */ if (!bc->bypass_host_cache) { info->need_conversion = false; return; } start = br->offset + bc->sub_file_start_lba; info->is_offset_aligned = (!(start % alignment)); info->alignment = alignment; blockif_init_iov_align_info(br); all_aligned = (info->is_offset_aligned && info->is_iov_base_aligned && info->is_iov_len_aligned); /* * If O_DIRECT flag is used and the request is aligned, * does NOT need to initialize the alignment info further. */ if (all_aligned) { info->need_conversion = false; return; } info->need_conversion = true; /* head area */ info->head = start % alignment; info->aligned_dn_start = start - info->head; /* tail area */ end = start + info->org_size; end_rmd = (end % alignment); info->tail = (end_rmd == 0) ? 
(0) : (alignment - end_rmd); info->aligned_dn_end = end - end_rmd; /* overall bounced area */ info->bounced_size = info->head + info->org_size + info->tail; /* only for debug purpose */ blockif_dump_align_info(br); return; } /* * Use a single bounce_iov to do the aligned READ/WRITE. * - bounce_iov cnt = 1 * - bounce_iov.iov_base = return of posix_memalign (aligned to @alignment) * - bounce_iov.len = bounced_size * - Accessing from the offset `aligned_dn_start` */ static int blockif_init_bounce_iov(struct blockif_req *br) { int ret = 0; void *bounce_buf = NULL; struct br_align_info *info = &br->align_info; ret = posix_memalign(&bounce_buf, info->alignment, info->bounced_size); if (ret != 0) { bounce_buf = NULL; pr_err("%s: posix_memalign fails, error %s \n", __func__, strerror(-ret)); } else { info->bounce_iov.iov_base = bounce_buf; info->bounce_iov.iov_len = info->bounced_size; } return ret; } static void blockif_deinit_bounce_iov(struct blockif_req *br) { struct br_align_info *info = &br->align_info; if (info->bounce_iov.iov_base == NULL) { pr_err("%s: info->bounce_iov.iov_base is NULL %s \n", __func__); return; } free(info->bounce_iov.iov_base); info->bounce_iov.iov_base = NULL; } /* * For READ access: * 1. Do the aligned READ (using `bounce_iov`) from the offset `aligned_dn_start`, with the length `bounced_size`. * 2. AFTER the aligned READ is completed, copy the data from the bounce_iov to the org_iov. * from | length * ------------------------------|--------------- * bounce_iov.iov_base + head | org_size */ static void blockif_complete_bounced_read(struct blockif_req *br) { struct iovec *iov = br->iov; struct br_align_info *info = &br->align_info; int length = info->org_size; int i, len, done; if (info->bounce_iov.iov_base == NULL) { pr_err("%s: info->bounce_iov.iov_base is NULL %s \n", __func__); return; } done = info->head; for (i = 0; i < br->iovcnt; i++) { len = (iov[i].iov_len < length) ? iov[i].iov_len : length; memcpy(iov[i].iov_base, info->bounce_iov.iov_base + done, len); done += len; length -= len; if (length <= 0) break; } return; }; /* * It is used to read out the head/tail area to construct the bounced data. * * Allocate an aligned buffer for @b_iov and do an aligned read from @offset (with length @alignment). * @offset shall be guaranteed to be aligned by caller (either aligned_dn_start or aligned_dn_end). */ static int blockif_read_head_or_tail_area(int fd, struct iovec *b_iov, off_t offset, uint32_t alignment) { int ret = 0; int bytes_read; void *area = NULL; ret = posix_memalign(&area, alignment, alignment); if (ret != 0) { area = NULL; pr_err("%s: posix_memalign fails, error %s \n", __func__, strerror(-ret)); return ret; } b_iov->iov_base = area; b_iov->iov_len = alignment; bytes_read = preadv(fd, b_iov, 1, offset); if (bytes_read < 0) { pr_err("%s: read fails \n", __func__); ret = errno; } return ret; } /* * For WRITE access: * 1. BEFORE the aligned WRITE is conducted, construct the bounced data with three parts in bounce_iov. * (a). If head is not 0, get data of first alignment area -> head_area data (by doing aligned read) * from | length * --------------------|--------------- * aligned_dn_start | alignment * * (b). If tail is not 0, get data of last alignment area -> tail_area data (by doing aligned read) * from | length * --------------------|--------------- * aligned_dn_end | alignment * * (c). 
Construct the bounced data in bounce_iov * from | to | length | source * --------------------|------------------|---------------|--------------------------------- * aligned_dn_start | start | head | head_area data from block device * start | end | org_size | data specified in org_iov[] * end | end + tail | tail | tail_area data from block device * 2. Do the aligned WRITE (using `bounce_iov`) from the offset `aligned_dn_start`, with the length `bounced_size`. */ static int blockif_init_bounced_write(struct blockif_ctxt *bc, struct blockif_req *br) { struct iovec *iov = br->iov; struct br_align_info *info = &br->align_info; uint32_t alignment = info->alignment; struct iovec head_iov, tail_iov; uint32_t head = info->head; uint32_t tail = info->tail; int i, done, ret; ret = 0; if (info->bounce_iov.iov_base == NULL) { pr_err("%s: info->bounce_iov.iov_base is NULL \n", __func__); return -1; } memset(&head_iov, 0, sizeof(head_iov)); memset(&tail_iov, 0, sizeof(tail_iov)); /* * If head is not 0, get data of first alignment area, head_area data (by doing aligned read) * from | length * --------------------|--------------- * aligned_dn_start | alignment */ if (head != 0) { ret = blockif_read_head_or_tail_area(bc->fd, &head_iov, info->aligned_dn_start, alignment); if (ret < 0) { pr_err("%s: fails to read out the head area \n", __func__); goto end; } } /* * If tail is not 0, get data of last alignment area, tail_area data (by doing aligned read) * from | length * --------------------|--------------- * aligned_dn_end | alignment */ if (tail != 0) { ret = blockif_read_head_or_tail_area(bc->fd, &tail_iov, info->aligned_dn_end, alignment); if (ret < 0) { pr_err("%s: fails to read out the tail area \n", __func__); goto end; } } done = 0; /* * Construct the bounced data in bounce_iov * from | to | length | source * --------------------|------------------|---------------|--------------------------------- * aligned_dn_start | start | head | head_area data from block device * start | end | org_size | data specified in org_iov[] * end | end + tail | tail | tail_area data from block device */ if (head_iov.iov_base != NULL) { memcpy(info->bounce_iov.iov_base, head_iov.iov_base, head); done += head; } /* data specified in org_iov[] */ for (i = 0; i < br->iovcnt; i++) { memcpy(info->bounce_iov.iov_base + done, iov[i].iov_base, iov[i].iov_len); done += iov[i].iov_len; } if (tail_iov.iov_base != NULL) { memcpy(info->bounce_iov.iov_base + done, tail_iov.iov_base + alignment - tail, tail); done += tail; } end: if (head_iov.iov_base != NULL) { free(head_iov.iov_base); } if (tail_iov.iov_base != NULL) { free(tail_iov.iov_base); } return ret; }; static void blockif_proc(struct blockif_queue *bq, struct blockif_elem *be) { struct blockif_req *br; struct blockif_ctxt *bc; struct br_align_info *info; ssize_t len, iovcnt; struct iovec *iovecs; off_t offset; int err; br = be->req; bc = bq->bc; info = &br->align_info; err = 0; if ((be->op == BOP_READ) || (be->op == BOP_WRITE)) { if (info->need_conversion) { /* bounce_iov has been initialized in blockif_request */ iovecs = &(info->bounce_iov); iovcnt = 1; offset = info->aligned_dn_start; } else { /* use the original iov if no conversion is required */ iovecs = br->iov; iovcnt = br->iovcnt; offset = br->offset + bc->sub_file_start_lba; } } switch (be->op) { case BOP_READ: len = preadv(bc->fd, iovecs, iovcnt, offset); if (info->need_conversion) { blockif_complete_bounced_read(br); blockif_deinit_bounce_iov(br); } if (len < 0) err = errno; else br->resid -= len; break; case 
BOP_WRITE: if (bc->rdonly) { err = EROFS; break; } len = pwritev(bc->fd, iovecs, iovcnt, offset); if (info->need_conversion) { blockif_deinit_bounce_iov(br); } if (len < 0) err = errno; else { br->resid -= len; err = blockif_flush_cache(bc); } break; case BOP_FLUSH: if (fsync(bc->fd)) err = errno; break; case BOP_DISCARD: err = blockif_process_discard(bc, br); break; default: err = EINVAL; break; } be->status = BST_DONE; (*br->callback)(br, err); } static void * blockif_thr(void *arg) { struct blockif_queue *bq; struct blockif_elem *be; pthread_t t; bq = arg; t = pthread_self(); pthread_mutex_lock(&bq->mtx); for (;;) { while (blockif_dequeue(bq, t, &be)) { pthread_mutex_unlock(&bq->mtx); blockif_proc(bq, be); pthread_mutex_lock(&bq->mtx); blockif_complete(bq, be); } /* Check ctxt status here to see if exit requested */ if (bq->closing) break; pthread_cond_wait(&bq->cond, &bq->mtx); } pthread_mutex_unlock(&bq->mtx); pthread_exit(NULL); return NULL; } static void blockif_sigcont_handler(int signal) { struct blockif_sig_elem *bse; WPRINTF(("block_if sigcont handler!\n")); for (;;) { /* * Process the entire list even if not intended for * this thread. */ do { bse = blockif_bse_head; if (bse == NULL) return; } while (!__sync_bool_compare_and_swap( (uintptr_t *)&blockif_bse_head, (uintptr_t)bse, (uintptr_t)bse->next)); pthread_mutex_lock(&bse->mtx); bse->pending = 0; pthread_cond_signal(&bse->cond); pthread_mutex_unlock(&bse->mtx); } } static void blockif_init(void) { signal(SIGCONT, blockif_sigcont_handler); } /* * This function checks if the sub file range, specified by sub_start and * sub_size, has any overlap with other sub file ranges with write access. */ static int sub_file_validate(struct blockif_ctxt *bc, int fd, int read_only, off_t sub_start, off_t sub_size) { struct flock *fl = &bc->fl; memset(fl, 0, sizeof(struct flock)); fl->l_whence = SEEK_SET; /* offset base is start of file */ if (read_only) fl->l_type = F_RDLCK; else fl->l_type = F_WRLCK; fl->l_start = sub_start; fl->l_len = sub_size; /* use "open file description locks" to validate */ if (fcntl(fd, F_OFD_SETLK, fl) == -1) { DPRINTF(("failed to lock subfile!\n")); return -1; } /* Keep file lock on to prevent other sub files, until DM exits */ return 0; } void sub_file_unlock(struct blockif_ctxt *bc) { struct flock *fl; if (bc->sub_file_assign) { fl = &bc->fl; DPRINTF(("blockif: release file lock...\n")); fl->l_type = F_UNLCK; if (fcntl(bc->fd, F_OFD_SETLK, fl) == -1) { pr_err("blockif: failed to unlock subfile!\n"); exit(1); } DPRINTF(("blockif: release done\n")); } } static int thread_pool_init(struct blockif_queue *bq, char *tag) { int i; char tname[MAXCOMLEN + 1]; for (i = 0; i < BLOCKIF_NUMTHR; i++) { if (snprintf(tname, sizeof(tname), "%s-%d", tag, i) >= sizeof(tname)) { pr_err("blk thread name too long"); } pthread_create(&bq->btid[i], NULL, blockif_thr, bq); pthread_setname_np(bq->btid[i], tname); } return 0; } static void thread_pool_deinit(struct blockif_queue *bq) { int i; void *jval; for (i = 0; i < BLOCKIF_NUMTHR; i++) pthread_join(bq->btid[i], &jval); } static inline void thread_pool_mutex_lock(pthread_mutex_t *mutex) { pthread_mutex_lock(mutex); } static inline void thread_pool_mutex_unlock(pthread_mutex_t *mutex) { pthread_mutex_unlock(mutex); } static void thread_pool_request(struct blockif_queue *bq) { pthread_cond_signal(&bq->cond); } static struct blockif_ops blockif_ops_thread_pool = { .aio_mode = AIO_MODE_THREAD_POOL, .init = thread_pool_init, .deinit = thread_pool_deinit, .mutex_lock = 
thread_pool_mutex_lock, .mutex_unlock = thread_pool_mutex_unlock, .request = thread_pool_request, }; static bool is_io_uring_supported_op(enum blockop op) { return ((op == BOP_READ) || (op == BOP_WRITE) || (op == BOP_FLUSH)); } static int iou_submit_sqe(struct blockif_queue *bq, struct blockif_elem *be) { int ret; struct io_uring *ring = &bq->ring; struct io_uring_sqe *sqes = io_uring_get_sqe(ring); struct blockif_req *br = be->req; struct blockif_ctxt *bc = bq->bc; struct br_align_info *info = &br->align_info; struct iovec *iovecs; size_t iovcnt; off_t offset; if (!sqes) { pr_err("%s: io_uring_get_sqe fails. NO available submission queue entry. \n", __func__); return -1; } if ((be->op == BOP_READ) || (be->op == BOP_WRITE)) { if (info->need_conversion) { /* bounce_iov has been initialized in blockif_request */ iovecs = &(info->bounce_iov); iovcnt = 1; offset = info->aligned_dn_start; } else { /* use the original iov if no conversion is required */ iovecs = br->iov; iovcnt = br->iovcnt; offset = br->offset + bc->sub_file_start_lba; } } switch (be->op) { case BOP_READ: io_uring_prep_readv(sqes, bc->fd, iovecs, iovcnt, offset); break; case BOP_WRITE: io_uring_prep_writev(sqes, bc->fd, iovecs, iovcnt, offset); break; case BOP_FLUSH: io_uring_prep_fsync(sqes, bc->fd, IORING_FSYNC_DATASYNC); break; default: /* is_io_uring_supported_op guarantees that this case will not occur */ break; } io_uring_sqe_set_data(sqes, be); bq->in_flight++; ret = io_uring_submit(ring); if (ret < 0) { pr_err("%s: io_uring_submit fails, error %s \n", __func__, strerror(-ret)); } return ret; } static void iou_submit(struct blockif_queue *bq) { int err = 0; struct blockif_elem *be; struct blockif_req *br; struct blockif_ctxt *bc = bq->bc; while (blockif_dequeue(bq, 0, &be)) { if (is_io_uring_supported_op(be->op)) { err = iou_submit_sqe(bq, be); /* * -1 means that there is NO available submission queue entry (SQE) in the submission queue. * Break the while loop here. Request can only be submitted when SQE is available. 
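 * (As an aside, liburing's io_uring_sq_space_left(ring) reports how many
 * submission entries remain free; this driver does not query it and simply
 * stops dequeuing for now.)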
*/ if (err == -1) { break; } } else { br = be->req; if (be->op == BOP_DISCARD) { err = blockif_process_discard(bc, br); } else { pr_err("%s: op %d is not supported \n", __func__, be->op); err = EINVAL; } be->status = BST_DONE; (*br->callback)(br, err); blockif_complete(bq, be); } } return; } static void iou_process_completions(struct blockif_queue *bq) { struct io_uring_cqe *cqes = NULL; struct blockif_elem *be; struct blockif_req *br; struct io_uring *ring = &bq->ring; int err = 0; while (io_uring_peek_cqe(ring, &cqes) == 0) { if (!cqes) { pr_err("%s: cqes is NULL \n", __func__); break; } be = io_uring_cqe_get_data(cqes); bq->in_flight--; io_uring_cqe_seen(ring, cqes); cqes = NULL; if (!be) { pr_err("%s: be is NULL \n", __func__); break; } br = be->req; if (!br) { pr_err("%s: br is NULL \n", __func__); break; } /* when a misaligned request is converted to an aligned one, need to do some post-work */ if (br->align_info.need_conversion) { if (be->op == BOP_READ) { blockif_complete_bounced_read(br); } blockif_deinit_bounce_iov(br); } if (be->op == BOP_WRITE) { err = blockif_flush_cache(bq->bc); } be->status = BST_DONE; (*br->callback)(br, err); blockif_complete(bq, be); } return; } static void iou_submit_and_reap(struct blockif_queue *bq) { iou_submit(bq); if (bq->in_flight > 0) { iou_process_completions(bq); } return; } static void iou_reap_and_submit(struct blockif_queue *bq) { iou_process_completions(bq); if (!TAILQ_EMPTY(&bq->pendq)) { iou_submit(bq); } return; } static void iou_completion_cb(void *arg) { struct blockif_queue *bq = (struct blockif_queue *)arg; iou_reap_and_submit(bq); } static int iou_set_iothread(struct blockif_queue *bq) { int fd = bq->ring.ring_fd; int ret = 0; bq->iomvt.arg = bq; bq->iomvt.run = iou_completion_cb; bq->iomvt.fd = fd; ret = iothread_add(bq->ioctx, fd, &bq->iomvt); if (ret < 0) { pr_err("%s: iothread_add fails, error %d \n", __func__, ret); } return ret; } static int iou_del_iothread(struct blockif_queue *bq) { int fd = bq->ring.ring_fd; int ret = 0; ret = iothread_del(bq->ioctx, fd); if (ret < 0) { pr_err("%s: iothread_del fails, error %d \n", __func__, ret); } return ret; } static int iou_init(struct blockif_queue *bq, char *tag __attribute__((unused))) { int ret = 0; struct io_uring *ring = &bq->ring; /* * - When Service VM owns more dedicated cores, IORING_SETUP_SQPOLL and IORING_SETUP_IOPOLL, along with NVMe * polling mechanism could benefit the performance. * - When Service VM owns limited cores, the benefit of polling is also limited. * As in most of the use cases, Service VM does not own much dedicated cores, IORING_SETUP_SQPOLL and * IORING_SETUP_IOPOLL are not enabled by default. 
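 *
 * For reference only (an illustrative sketch, not code used here), a polled
 * setup would be requested at init time through the standard liburing API,
 * with sq_thread_idle giving the SQ poll thread idle time in milliseconds:
 *
 *	struct io_uring_params p = { 0 };
 *	p.flags = IORING_SETUP_SQPOLL;
 *	p.sq_thread_idle = 2000;
 *	ret = io_uring_queue_init_params(MAX_IO_URING_ENTRIES, ring, &p);
 *
 * IORING_SETUP_IOPOLL would additionally require the backing file to be
 * opened with O_DIRECT.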
*/ ret = io_uring_queue_init(MAX_IO_URING_ENTRIES, ring, 0); if (ret < 0) { pr_err("%s: io_uring_queue_init fails, error %d \n", __func__, ret); } else { ret = iou_set_iothread(bq); if (ret < 0) { pr_err("%s: iou_set_iothread fails \n", __func__); } } return ret; } static void iou_deinit(struct blockif_queue *bq) { struct io_uring *ring = &bq->ring; iou_del_iothread(bq); io_uring_queue_exit(ring); } static inline void iou_mutex_lock(pthread_mutex_t *mutex __attribute__((unused))) {} static inline void iou_mutex_unlock(pthread_mutex_t *mutex __attribute__((unused))) {} static struct blockif_ops blockif_ops_iou = { .aio_mode = AIO_MODE_IO_URING, .init = iou_init, .deinit = iou_deinit, .mutex_lock = iou_mutex_lock, .mutex_unlock = iou_mutex_unlock, .request = iou_submit_and_reap, }; struct blockif_ctxt * blockif_open(const char *optstr, const char *ident, int queue_num, struct iothreads_info *iothrds_info) { char tag[MAXCOMLEN + 1]; char *nopt, *xopts, *cp; struct blockif_ctxt *bc = NULL; struct stat sbuf; /* struct diocgattr_arg arg; */ off_t size, psectsz, psectoff; int fd, i, j, sectsz; int writeback, ro, candiscard, ssopt, pssopt; long sz; long long b; int err_code = -1; off_t sub_file_start_lba, sub_file_size; int sub_file_assign; int max_discard_sectors, max_discard_seg, discard_sector_alignment; off_t probe_arg[] = {0, 0}; int aio_mode; int bypass_host_cache, open_flag, bst_block; pthread_once(&blockif_once, blockif_init); fd = -1; ssopt = 0; pssopt = 0; ro = 0; sub_file_assign = 0; sub_file_start_lba = 0; sub_file_size = 0; max_discard_sectors = -1; max_discard_seg = -1; discard_sector_alignment = -1; /* default mode is thread pool */ aio_mode = AIO_MODE_THREAD_POOL; /* writethru is on by default */ writeback = 0; /* By default, do NOT bypass Service VM's page cache. */ bypass_host_cache = 0; /* By default, bst_block is 1, meaning that the BST_BLOCK logic in blockif_dequeue is enabled. */ bst_block = 1; candiscard = 0; if (queue_num <= 0) queue_num = 1; /* * The first element in the optstring is always a pathname. 
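 * (For example, with an illustrative path rather than any default value,
 * an optstring could look like
 *	/data/vm1.img,writeback,sectorsize=512,aio=io_uring
 * where /data/vm1.img is the pathname.)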
* Optional elements follow */ nopt = xopts = strdup(optstr); if (!nopt) { WPRINTF(("block_if.c: strdup retruns NULL\n")); return NULL; } while (xopts != NULL) { cp = strsep(&xopts, ","); if (cp == nopt) /* file or device pathname */ continue; else if (!strcmp(cp, "writeback")) writeback = 1; else if (!strcmp(cp, "writethru")) writeback = 0; else if (!strcmp(cp, "ro")) ro = 1; else if (!strcmp(cp, "nocache")) bypass_host_cache = 1; else if (!strcmp(cp, "no_bst_block")) bst_block = 0; else if (!strncmp(cp, "discard", strlen("discard"))) { strsep(&cp, "="); if (cp != NULL) { if (!(!dm_strtoi(cp, &cp, 10, &max_discard_sectors) && *cp == ':' && !dm_strtoi(cp + 1, &cp, 10, &max_discard_seg) && *cp == ':' && !dm_strtoi(cp + 1, &cp, 10, &discard_sector_alignment))) goto err; } candiscard = 1; } else if (!strncmp(cp, "sectorsize", strlen("sectorsize"))) { /* * sectorsize= * or * sectorsize=/ */ if (strsep(&cp, "=") && !dm_strtoi(cp, &cp, 10, &ssopt)) { pssopt = ssopt; if (*cp == '/' && dm_strtoi(cp + 1, &cp, 10, &pssopt) < 0) goto err; } else { goto err; } } else if (!strncmp(cp, "range", strlen("range"))) { /* range=/ */ if (strsep(&cp, "=") && !dm_strtol(cp, &cp, 10, &sub_file_start_lba) && *cp == '/' && !dm_strtol(cp + 1, &cp, 10, &sub_file_size)) sub_file_assign = 1; else goto err; } else if (!strncmp(cp, "aio", strlen("aio"))) { /* aio=threads or aio=io_uring */ strsep(&cp, "="); if (cp != NULL) { if (!strncmp(cp, "threads", strlen("threads"))) { aio_mode = AIO_MODE_THREAD_POOL; } else if (!strncmp(cp, "io_uring", strlen("io_uring"))) { aio_mode = AIO_MODE_IO_URING; } else { pr_err("Invalid aio option, only support threads or io_uring \"%s\"\n", cp); goto err; } } } else { pr_err("Invalid device option \"%s\"\n", cp); goto err; } } /* * To support "writeback" and "writethru" mode switch during runtime, * O_SYNC is not used directly, as O_SYNC flag cannot dynamic change * after file is opened. Instead, we call fsync() after each write * operation to emulate it. */ open_flag = (ro ? 
O_RDONLY : O_RDWR); if (bypass_host_cache == 1) { open_flag |= O_DIRECT; } fd = open(nopt, open_flag); if (fd < 0 && !ro) { /* Attempt a r/w fail with a r/o open */ fd = open(nopt, O_RDONLY); ro = 1; } if (fd < 0) { pr_err("Could not open backing file: %s", nopt); goto err; } if (fstat(fd, &sbuf) < 0) { pr_err("Could not stat backing file %s", nopt); goto err; } /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; psectsz = psectoff = 0; if (S_ISBLK(sbuf.st_mode)) { /* get size */ err_code = ioctl(fd, BLKGETSIZE, &sz); if (err_code) { pr_err("error %d getting block size!\n", err_code); size = sbuf.st_size; /* set default value */ } else { size = sz * DEV_BSIZE; /* DEV_BSIZE is 512 on Linux */ } if (!err_code || err_code == EFBIG) { err_code = ioctl(fd, BLKGETSIZE64, &b); if (err_code || b == 0 || b == sz) size = b * DEV_BSIZE; else size = b; } DPRINTF(("block partition size is 0x%lx\n", size)); /* get sector size, 512 on Linux */ sectsz = DEV_BSIZE; DPRINTF(("block partition sector size is 0x%x\n", sectsz)); /* get physical sector size */ err_code = ioctl(fd, BLKPBSZGET, &psectsz); if (err_code) { pr_err("error %d getting physical sectsz!\n", err_code); psectsz = DEV_BSIZE; /* set default physical size */ } DPRINTF(("block partition physical sector size is 0x%lx\n", psectsz)); if (candiscard) { err_code = ioctl(fd, BLKDISCARD, probe_arg); if (err_code) { WPRINTF(("not support DISCARD\n")); candiscard = 0; } } } else { if (size < DEV_BSIZE || (size & (DEV_BSIZE - 1))) { WPRINTF(("%s size not corret, should be multiple of %d\n", nopt, DEV_BSIZE)); goto err; } psectsz = sbuf.st_blksize; } if (ssopt != 0) { if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || ssopt > pssopt) { pr_err("Invalid sector size %d/%d\n", ssopt, pssopt); goto err; } /* * Some backend drivers (e.g. cd0, ada0) require that the I/O * size be a multiple of the device's sector size. * * Validate that the emulated sector size complies with this * requirement. */ if (S_ISCHR(sbuf.st_mode)) { if (ssopt < sectsz || (ssopt % sectsz) != 0) { pr_err("Sector size %d incompatible with underlying device sector size %d\n", ssopt, sectsz); goto err; } } sectsz = ssopt; psectsz = pssopt; psectoff = 0; } bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { pr_err("calloc"); goto err; } if (sub_file_assign) { DPRINTF(("sector size is %d\n", sectsz)); bc->sub_file_assign = 1; bc->sub_file_start_lba = sub_file_start_lba * sectsz; size = sub_file_size * sectsz; DPRINTF(("Validating sub file...\n")); err_code = sub_file_validate(bc, fd, ro, bc->sub_file_start_lba, size); if (err_code < 0) { pr_err("subfile range specified not valid!\n"); exit(1); } DPRINTF(("Validated done!\n")); } else { /* normal case */ bc->sub_file_assign = 0; bc->sub_file_start_lba = 0; } bc->fd = fd; bc->isblk = S_ISBLK(sbuf.st_mode); bc->candiscard = candiscard; if (candiscard) { bc->max_discard_sectors = (max_discard_sectors != -1) ? max_discard_sectors : (size / DEV_BSIZE); bc->max_discard_seg = (max_discard_seg != -1) ? max_discard_seg : 1; bc->discard_sector_alignment = (discard_sector_alignment != -1) ? 
discard_sector_alignment : 0; } bc->rdonly = ro; bc->size = size; bc->sectsz = sectsz; bc->psectsz = psectsz; bc->psectoff = psectoff; bc->wce = writeback; bc->bypass_host_cache = bypass_host_cache; bc->aio_mode = aio_mode; if (bc->aio_mode == AIO_MODE_IO_URING) { bc->ops = &blockif_ops_iou; bc->bst_block = 0; } else { bc->ops = &blockif_ops_thread_pool; bc->bst_block = bst_block; } bc->bq_num = queue_num; bc->bqs = calloc(bc->bq_num, sizeof(struct blockif_queue)); if (bc->bqs == NULL) { pr_err("calloc bqs"); goto err; } for (j = 0; j < bc->bq_num; j++) { struct blockif_queue *bq = bc->bqs + j; bq->bc = bc; if ((iothrds_info != NULL) && (iothrds_info->ioctx_base != NULL) && (iothrds_info->num != 0)) { bq->ioctx = iothrds_info->ioctx_base + j % iothrds_info->num; } else { bq->ioctx = NULL; } pthread_mutex_init(&bq->mtx, NULL); pthread_cond_init(&bq->cond, NULL); TAILQ_INIT(&bq->freeq); TAILQ_INIT(&bq->pendq); TAILQ_INIT(&bq->busyq); for (i = 0; i < BLOCKIF_MAXREQ; i++) { bq->reqs[i].status = BST_FREE; TAILQ_INSERT_HEAD(&bq->freeq, &bq->reqs[i], link); } if (snprintf(tag, sizeof(tag), "blk-%s-%d", ident, j) >= sizeof(tag)) { pr_err("blk thread tag too long"); } if (bc->ops->init) { if (bc->ops->init(bq, tag) < 0) { goto err; } } } /* free strdup memory */ if (nopt) { free(nopt); nopt = NULL; } return bc; err: /* handle failure case: free strdup memory*/ if (nopt) free(nopt); if (fd >= 0) close(fd); if (bc) { if (bc->bqs) free(bc->bqs); free(bc); } return NULL; } static int blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { struct blockif_queue *bq; int err; err = 0; if (breq->qidx >= bc->bq_num) { pr_err("%s: invalid qidx %d\n", __func__, breq->qidx); return ENOENT; } bq = bc->bqs + breq->qidx; blockif_init_alignment_info(bc, breq); /* For misaligned READ/WRITE, need a bounce_iov to convert the misaligned request to an aligned one. */ if (((op == BOP_READ) || (op == BOP_WRITE)) && (breq->align_info.need_conversion)) { err = blockif_init_bounce_iov(breq); if (err < 0) { return err; } if (op == BOP_WRITE) { err = blockif_init_bounced_write(bc, breq); if (err < 0) { return err; } } } if (bc->ops->mutex_lock) { bc->ops->mutex_lock(&bq->mtx); } if (!TAILQ_EMPTY(&bq->freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ if (blockif_enqueue(bq, breq, op)) { if (bc->ops->request) { bc->ops->request(bq); } } } else { /* * Callers are not allowed to enqueue more than * the specified blockif queue limit. Return an * error to indicate that the queue length has been * exceeded. */ err = E2BIG; } if (bc->ops->mutex_unlock) { bc->ops->mutex_unlock(&bq->mtx); } return err; } int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) { return blockif_request(bc, breq, BOP_READ); } int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) { return blockif_request(bc, breq, BOP_WRITE); } int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) { return blockif_request(bc, breq, BOP_FLUSH); } int blockif_discard(struct blockif_ctxt *bc, struct blockif_req *breq) { return blockif_request(bc, breq, BOP_DISCARD); } int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { struct blockif_elem *be; struct blockif_queue *bq; if (breq->qidx >= bc->bq_num) { pr_err("%s: invalid qidx %d\n", __func__, breq->qidx); return ENOENT; } bq = bc->bqs + breq->qidx; pthread_mutex_lock(&bq->mtx); /* * Check pending requests. 
*/ TAILQ_FOREACH(be, &bq->pendq, link) { if (be->req == breq) break; } if (be != NULL) { /* * Found it. */ blockif_complete(bq, be); pthread_mutex_unlock(&bq->mtx); return 0; } /* * Check in-flight requests. */ TAILQ_FOREACH(be, &bq->busyq, link) { if (be->req == breq) break; } if (be == NULL) { /* * Didn't find it. */ pthread_mutex_unlock(&bq->mtx); return -1; } /* * Interrupt the processing thread to force it return * prematurely via it's normal callback path. */ while (be->status == BST_BUSY) { struct blockif_sig_elem bse, *old_head; pthread_mutex_init(&bse.mtx, NULL); pthread_cond_init(&bse.cond, NULL); bse.pending = 1; do { old_head = blockif_bse_head; bse.next = old_head; } while (!__sync_bool_compare_and_swap((uintptr_t *)& blockif_bse_head, (uintptr_t)old_head, (uintptr_t)&bse)); pthread_kill(be->tid, SIGCONT); pthread_mutex_lock(&bse.mtx); while (bse.pending) pthread_cond_wait(&bse.cond, &bse.mtx); pthread_mutex_unlock(&bse.mtx); } pthread_mutex_unlock(&bq->mtx); /* * The processing thread has been interrupted. Since it's not * clear if the callback has been invoked yet, return EBUSY. */ return -EBUSY; } int blockif_close(struct blockif_ctxt *bc) { int j; sub_file_unlock(bc); /* * Stop the block i/o thread */ for (j = 0; j < bc->bq_num; j++) { struct blockif_queue *bq = bc->bqs + j; pthread_mutex_lock(&bq->mtx); bq->closing = 1; pthread_cond_broadcast(&bq->cond); pthread_mutex_unlock(&bq->mtx); if (bc->ops->deinit) { bc->ops->deinit(bq); } } /* XXX Cancel queued i/o's ??? */ /* * Release resources */ close(bc->fd); if (bc->bqs) free(bc->bqs); free(bc); return 0; } /* * Return virtual C/H/S values for a given block. Use the algorithm * outlined in the VHD specification to calculate values. */ void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) { off_t sectors; /* total sectors of the block dev */ off_t hcyl; /* cylinders times heads */ uint16_t secpt; /* sectors per track */ uint8_t heads; sectors = bc->size / bc->sectsz; /* Clamp the size to the largest possible with CHS */ if (sectors > 65535UL*16*255) sectors = 65535UL*16*255; if (sectors >= 65536UL*16*63) { secpt = 255; heads = 16; hcyl = sectors / secpt; } else { secpt = 17; hcyl = sectors / secpt; heads = (hcyl + 1023) / 1024; if (heads < 4) heads = 4; if (hcyl >= (heads * 1024) || heads > 16) { secpt = 31; heads = 16; hcyl = sectors / secpt; } if (hcyl >= (heads * 1024)) { secpt = 63; heads = 16; hcyl = sectors / secpt; } } *c = hcyl / heads; *h = heads; *s = secpt; } /* * Accessors */ off_t blockif_size(struct blockif_ctxt *bc) { return bc->size; } int blockif_sectsz(struct blockif_ctxt *bc) { return bc->sectsz; } void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) { *size = bc->psectsz; *off = bc->psectoff; } int blockif_queuesz(struct blockif_ctxt *bc) { return (BLOCKIF_MAXREQ - 1); } int blockif_is_ro(struct blockif_ctxt *bc) { return bc->rdonly; } int blockif_candiscard(struct blockif_ctxt *bc) { return bc->candiscard; } int blockif_max_discard_sectors(struct blockif_ctxt *bc) { return bc->max_discard_sectors; } int blockif_max_discard_seg(struct blockif_ctxt *bc) { return bc->max_discard_seg; } int blockif_discard_sector_alignment(struct blockif_ctxt *bc) { return bc->discard_sector_alignment; } uint8_t blockif_get_wce(struct blockif_ctxt *bc) { return bc->wce; } void blockif_set_wce(struct blockif_ctxt *bc, uint8_t wce) { bc->wce = wce; } int blockif_flush_all(struct blockif_ctxt *bc) { int err; err=0; if (fsync(bc->fd)) err = errno; return err; }
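
/*
 * Illustrative usage sketch (not part of the driver): how a front-end could
 * queue a read through this interface. The blockif_req field names used
 * below (iov, iovcnt, offset, resid, qidx, callback) are the ones referenced
 * in this file; the full structure definition lives in block_if.h, and iov
 * is assumed to be an inline array there.
 */
#if 0
static void
example_done(struct blockif_req *br, int err)
{
	/* Runs on a blockif worker thread or on the io_uring iothread. */
	if (err)
		pr_err("example: I/O failed, error %d\n", err);
}

static int
example_read_first_sector(struct blockif_ctxt *bc, void *buf)
{
	/* Must stay valid until the callback fires. */
	static struct blockif_req br;

	memset(&br, 0, sizeof(br));
	br.iov[0].iov_base = buf;
	br.iov[0].iov_len = blockif_sectsz(bc);
	br.iovcnt = 1;
	br.offset = 0;
	br.resid = br.iov[0].iov_len;
	br.qidx = 0;
	br.callback = example_done;

	/* Returns 0 on success, or E2BIG once the request queue is full. */
	return blockif_read(bc, &br);
}
#endif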