From 14c20fa31caf72e22e8f956cf3f5cf896561d8d7 Mon Sep 17 00:00:00 2001 From: Shiqing Gao Date: Mon, 16 Oct 2023 22:55:55 +0800 Subject: [PATCH] dm: block_if: support misaligned request when O_DIRECT is used Use of O_DIRECT flag could be a performance option. But this flag may impose alignment restrictions on the length and address of user-space buffers and the file offset of I/Os. To support the use of O_DIRECT flag in block_if, this patch adds the support to handle the misaligned request. - When O_DIRECT flag is used (`nocache` is specified in acrn-dm parameters), * if the original I/O request is aligned, the original I/O request is submitted directly. * if the original I/O request is not aligned (either due to the buffer address/length misalignment, or the offset misalignment), the misaligned request is converted to an aligned request before submission. - When O_DIRECT flag is not used, the original I/O request is submitted directly. v1 -> v2: * cleanup the free() logic in `blockif_init_bounced_write` Tracked-On: #8612 Signed-off-by: Shiqing Gao Acked-by: Wang, Yu1 --- devicemodel/hw/block_if.c | 490 ++++++++++++++++++++++++++++++++- devicemodel/include/block_if.h | 59 +++- 2 files changed, 535 insertions(+), 14 deletions(-) diff --git a/devicemodel/hw/block_if.c b/devicemodel/hw/block_if.c index 3cb6f6cad..f405f2561 100644 --- a/devicemodel/hw/block_if.c +++ b/devicemodel/hw/block_if.c @@ -161,6 +161,9 @@ struct blockif_ctxt { /* write cache enable */ uint8_t wce; + + /* whether bypass the Service VM's page cache or not */ + uint8_t bypass_host_cache; }; static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; @@ -362,21 +365,448 @@ blockif_process_discard(struct blockif_ctxt *bc, struct blockif_req *br) return 0; } +static void +blockif_init_iov_align_info(struct blockif_req *br) +{ + int i, size; + struct br_align_info *info = &br->align_info; + + size = 0; + info->is_iov_base_aligned = true; + info->is_iov_len_aligned = true; + + for (i = 0; i < br->iovcnt; i++) { + size += br->iov[i].iov_len; + + if ((uint64_t)(br->iov[i].iov_base) % info->alignment) { + info->is_iov_base_aligned = false; + } + + if (br->iov[i].iov_len % info->alignment) { + info->is_iov_len_aligned = false; + } + } + + info->org_size = size; + + return; +} + +/* only for debug purpose */ +static void +blockif_dump_align_info(struct blockif_req *br) +{ + struct br_align_info *info = &br->align_info; + int i; + + if (!info->is_offset_aligned) { + DPRINTF(("%s: Misaligned offset 0x%llx \n\r", __func__, (info->aligned_dn_start + info->head))); + } + + /* iov info */ + if (!info->is_iov_base_aligned) { + DPRINTF(("%s: Misaligned iov_base \n\r", __func__)); + } + if (!info->is_iov_len_aligned) { + DPRINTF(("%s: Misaligned iov_len \n\r", __func__)); + } + + DPRINTF(("%s: alignment %d, br->iovcnt %d \n\r", __func__, info->alignment, br->iovcnt)); + for (i = 0; i < br->iovcnt; i++) { + DPRINTF(("%s: iov[%d].iov_base 0x%llx (remainder %d), iov[%d].iov_len %d (remainder %d) \n\r", + __func__, + i, (uint64_t)(br->iov[i].iov_base), (uint64_t)(br->iov[i].iov_base) % info->alignment, + i, br->iov[i].iov_len, (br->iov[i].iov_len) % info->alignment)); + } + + /* overall info */ + DPRINTF(("%s: head %d, tail %d, org_size %d, bounced_size %d, aligned_dn_start 0x%lx aligned_dn_end 0x%lx \n\r", + __func__, info->head, info->tail, info->org_size, info->bounced_size, + info->aligned_dn_start, info->aligned_dn_end)); +} + +/* + * |<------------------------------------- bounced_size --------------------------------->| + * |<-------- 
alignment ------->| |<-------- alignment ------->| + * |<--- head --->|<------------------------ org_size ---------------------->|<-- tail -->| + * | | | | | | + * *--------------$-------------*----------- ... ------------*---------------$------------* + * | | | | | | + * | start end | + * aligned_dn_start aligned_dn_end + * |__________head_area_________| |__________tail_area_________| + * |<--- head --->| | |<-- end_rmd -->|<-- tail -->| + * |<-------- alignment ------->| |<-------- alignment ------->| + * + * + * Original access area: + * - start = br->offset + bc->sub_file_start_lba + * - org_size = SUM of org_iov[i].iov_len + * - end = start + org_size + * + * + * Head area to be bounced: + * - head = start % alignment + * - aligned_dn_start = start - head + * head | head_area + * -------------|------------- + * 0 | not exist + * non-zero | exist + * + * + * Tail area to be bounced: + * - end_rmd = end % alignment + * - aligned_dn_end = end - end_rmd + * end_rmd | tail | tail_area + * -------------|-----------------------|------------------ + * 0 | 0 | not exist + * non-zero | alignment - end_rmd | exist + * + * + * Overall bounced area: + * - bounced_size = head + org_size + tail + * + * + * Use a single bounce_iov to do the aligned READ/WRITE. + * - bounce_iov cnt = 1 + * - bounce_iov.iov_base = return of posix_memalign (aligned to @alignment) + * - bounce_iov.len = bounced_size + * - Accessing from the offset `aligned_dn_start` + * + * + * For READ access: + * 1. Do the aligned READ (using `bounce_iov`) from the offset `aligned_dn_start`, with the length `bounced_size`. + * 2. AFTER the aligned READ is completed, copy the data from the bounce_iov to the org_iov. + * from | length + * ------------------------------|--------------- + * bounce_iov.iov_base + head | org_size + * + * + * For WRITE access: + * 1. BEFORE the aligned WRITE is conducted, construct the bounced data with three parts in bounce_iov. + * (a). If head is not 0, get data of first alignment area -> head_area data (by doing aligned read) + * from | length + * --------------------|--------------- + * aligned_dn_start | alignment + * + * (b). If tail is not 0, get data of last alignment area -> tail_area data (by doing aligned read) + * from | length + * --------------------|--------------- + * aligned_dn_end | alignment + * + * (c). Construct the bounced data in bounce_iov + * from | to | length | source + * --------------------|------------------|---------------|--------------------------------- + * aligned_dn_start | start | head | head_area data from block device + * start | end | org_size | data specified in org_iov[] + * end | end + tail | tail | tail_area data from block device + * 2. Do the aligned WRITE (using `bounce_iov`) from the offset `aligned_dn_start`, with the length `bounced_size`. + * + * + */ +static void +blockif_init_alignment_info(struct blockif_ctxt *bc, struct blockif_req *br) +{ + struct br_align_info *info = &br->align_info; + uint32_t alignment = bc->sectsz; + uint32_t end_rmd; + off_t start, end; + bool all_aligned; + + /* If O_DIRECT flag is not used, does NOT need to initialize the alignment info. 
*/ + if (!bc->bypass_host_cache) { + info->need_conversion = false; + return; + } + + start = br->offset + bc->sub_file_start_lba; + info->is_offset_aligned = (!(start % alignment)); + + info->alignment = alignment; + blockif_init_iov_align_info(br); + + all_aligned = (info->is_offset_aligned && info->is_iov_base_aligned && info->is_iov_len_aligned); + /* + * If O_DIRECT flag is used and the request is aligned, + * does NOT need to initialize the alignment info further. + */ + if (all_aligned) { + info->need_conversion = false; + return; + } + info->need_conversion = true; + + /* head area */ + info->head = start % alignment; + info->aligned_dn_start = start - info->head; + + /* tail area */ + end = start + info->org_size; + end_rmd = (end % alignment); + info->tail = (end_rmd == 0) ? (0) : (alignment - end_rmd); + info->aligned_dn_end = end - end_rmd; + + /* overall bounced area */ + info->bounced_size = info->head + info->org_size + info->tail; + + /* only for debug purpose */ + blockif_dump_align_info(br); + + return; +} + +/* + * Use a single bounce_iov to do the aligned READ/WRITE. + * - bounce_iov cnt = 1 + * - bounce_iov.iov_base = return of posix_memalign (aligned to @alignment) + * - bounce_iov.len = bounced_size + * - Accessing from the offset `aligned_dn_start` + */ +static int +blockif_init_bounce_iov(struct blockif_req *br) +{ + int ret = 0; + void *bounce_buf = NULL; + struct br_align_info *info = &br->align_info; + + ret = posix_memalign(&bounce_buf, info->alignment, info->bounced_size); + if (ret != 0) { + bounce_buf = NULL; + pr_err("%s: posix_memalign fails, error %s \n", __func__, strerror(-ret)); + } else { + info->bounce_iov.iov_base = bounce_buf; + info->bounce_iov.iov_len = info->bounced_size; + } + + return ret; +} + +static void +blockif_deinit_bounce_iov(struct blockif_req *br) +{ + struct br_align_info *info = &br->align_info; + + if (info->bounce_iov.iov_base == NULL) { + pr_err("%s: info->bounce_iov.iov_base is NULL %s \n", __func__); + return; + } + + free(info->bounce_iov.iov_base); + info->bounce_iov.iov_base = NULL; +} + +/* + * For READ access: + * 1. Do the aligned READ (using `bounce_iov`) from the offset `aligned_dn_start`, with the length `bounced_size`. + * 2. AFTER the aligned READ is completed, copy the data from the bounce_iov to the org_iov. + * from | length + * ------------------------------|--------------- + * bounce_iov.iov_base + head | org_size + */ +static void +blockif_complete_bounced_read(struct blockif_req *br) +{ + struct iovec *iov = br->iov; + struct br_align_info *info = &br->align_info; + int length = info->org_size; + int i, len, done; + + if (info->bounce_iov.iov_base == NULL) { + pr_err("%s: info->bounce_iov.iov_base is NULL %s \n", __func__); + return; + } + + done = info->head; + for (i = 0; i < br->iovcnt; i++) { + len = (iov[i].iov_len < length) ? iov[i].iov_len : length; + memcpy(iov[i].iov_base, info->bounce_iov.iov_base + done, len); + + done += len; + length -= len; + if (length <= 0) + break; + } + + return; +}; + +/* + * It is used to read out the head/tail area to construct the bounced data. + * + * Allocate an aligned buffer for @b_iov and do an aligned read from @offset (with length @alignment). + * @offset shall be guaranteed to be aligned by caller (either aligned_dn_start or aligned_dn_end). 
+ */ +static int +blockif_read_head_or_tail_area(int fd, struct iovec *b_iov, off_t offset, uint32_t alignment) +{ + int ret = 0; + int bytes_read; + void *area = NULL; + + ret = posix_memalign(&area, alignment, alignment); + if (ret != 0) { + area = NULL; + pr_err("%s: posix_memalign fails, error %s \n", __func__, strerror(-ret)); + return ret; + } + + b_iov->iov_base = area; + b_iov->iov_len = alignment; + bytes_read = preadv(fd, b_iov, 1, offset); + + if (bytes_read < 0) { + pr_err("%s: read fails \n", __func__); + ret = errno; + } + + return ret; +} + +/* + * For WRITE access: + * 1. BEFORE the aligned WRITE is conducted, construct the bounced data with three parts in bounce_iov. + * (a). If head is not 0, get data of first alignment area -> head_area data (by doing aligned read) + * from | length + * --------------------|--------------- + * aligned_dn_start | alignment + * + * (b). If tail is not 0, get data of last alignment area -> tail_area data (by doing aligned read) + * from | length + * --------------------|--------------- + * aligned_dn_end | alignment + * + * (c). Construct the bounced data in bounce_iov + * from | to | length | source + * --------------------|------------------|---------------|--------------------------------- + * aligned_dn_start | start | head | head_area data from block device + * start | end | org_size | data specified in org_iov[] + * end | end + tail | tail | tail_area data from block device + * 2. Do the aligned WRITE (using `bounce_iov`) from the offset `aligned_dn_start`, with the length `bounced_size`. + */ +static int +blockif_init_bounced_write(struct blockif_ctxt *bc, struct blockif_req *br) +{ + struct iovec *iov = br->iov; + struct br_align_info *info = &br->align_info; + uint32_t alignment = info->alignment; + struct iovec head_iov, tail_iov; + uint32_t head = info->head; + uint32_t tail = info->tail; + int i, done, ret; + + ret = 0; + + if (info->bounce_iov.iov_base == NULL) { + pr_err("%s: info->bounce_iov.iov_base is NULL \n", __func__); + return -1; + } + + memset(&head_iov, 0, sizeof(head_iov)); + memset(&tail_iov, 0, sizeof(tail_iov)); + + /* + * If head is not 0, get data of first alignment area, head_area data (by doing aligned read) + * from | length + * --------------------|--------------- + * aligned_dn_start | alignment + */ + if (head != 0) { + ret = blockif_read_head_or_tail_area(bc->fd, &head_iov, info->aligned_dn_start, alignment); + if (ret < 0) { + pr_err("%s: fails to read out the head area \n", __func__); + goto end; + } + } + + /* + * If tail is not 0, get data of last alignment area, tail_area data (by doing aligned read) + * from | length + * --------------------|--------------- + * aligned_dn_end | alignment + */ + if (tail != 0) { + ret = blockif_read_head_or_tail_area(bc->fd, &tail_iov, info->aligned_dn_end, alignment); + if (ret < 0) { + pr_err("%s: fails to read out the tail area \n", __func__); + goto end; + } + } + + done = 0; + /* + * Construct the bounced data in bounce_iov + * from | to | length | source + * --------------------|------------------|---------------|--------------------------------- + * aligned_dn_start | start | head | head_area data from block device + * start | end | org_size | data specified in org_iov[] + * end | end + tail | tail | tail_area data from block device + */ + if (head_iov.iov_base != NULL) { + memcpy(info->bounce_iov.iov_base, head_iov.iov_base, head); + done += head; + } + + /* data specified in org_iov[] */ + for (i = 0; i < br->iovcnt; i++) { + 
memcpy(info->bounce_iov.iov_base + done, iov[i].iov_base, iov[i].iov_len); + done += iov[i].iov_len; + } + + if (tail_iov.iov_base != NULL) { + memcpy(info->bounce_iov.iov_base + done, tail_iov.iov_base + alignment - tail, tail); + done += tail; + } + +end: + if (head_iov.iov_base != NULL) { + free(head_iov.iov_base); + } + + if (tail_iov.iov_base != NULL) { + free(tail_iov.iov_base); + } + + return ret; +}; + static void blockif_proc(struct blockif_queue *bq, struct blockif_elem *be) { struct blockif_req *br; struct blockif_ctxt *bc; - ssize_t len; + struct br_align_info *info; + ssize_t len, iovcnt; + struct iovec *iovecs; + off_t offset; int err; br = be->req; bc = bq->bc; + info = &br->align_info; err = 0; + + if ((be->op == BOP_READ) || (be->op == BOP_WRITE)) { + if (info->need_conversion) { + /* bounce_iov has been initialized in blockif_request */ + iovecs = &(info->bounce_iov); + iovcnt = 1; + offset = info->aligned_dn_start; + } else { + /* use the original iov if no conversion is required */ + iovecs = br->iov; + iovcnt = br->iovcnt; + offset = br->offset + bc->sub_file_start_lba; + } + } + switch (be->op) { case BOP_READ: - len = preadv(bc->fd, br->iov, br->iovcnt, - br->offset + bc->sub_file_start_lba); + len = preadv(bc->fd, iovecs, iovcnt, offset); + if (info->need_conversion) { + blockif_complete_bounced_read(br); + blockif_deinit_bounce_iov(br); + } + if (len < 0) err = errno; else @@ -388,8 +818,11 @@ blockif_proc(struct blockif_queue *bq, struct blockif_elem *be) break; } - len = pwritev(bc->fd, br->iov, br->iovcnt, - br->offset + bc->sub_file_start_lba); + len = pwritev(bc->fd, iovecs, iovcnt, offset); + if (info->need_conversion) { + blockif_deinit_bounce_iov(br); + } + if (len < 0) err = errno; else { @@ -596,18 +1029,36 @@ iou_submit_sqe(struct blockif_queue *bq, struct blockif_elem *be) struct io_uring_sqe *sqes = io_uring_get_sqe(ring); struct blockif_req *br = be->req; struct blockif_ctxt *bc = bq->bc; + struct br_align_info *info = &br->align_info; + struct iovec *iovecs; + size_t iovcnt; + off_t offset; if (!sqes) { pr_err("%s: io_uring_get_sqe fails. NO available submission queue entry. 
\n", __func__); return -1; } + if ((be->op == BOP_READ) || (be->op == BOP_WRITE)) { + if (info->need_conversion) { + /* bounce_iov has been initialized in blockif_request */ + iovecs = &(info->bounce_iov); + iovcnt = 1; + offset = info->aligned_dn_start; + } else { + /* use the original iov if no conversion is required */ + iovecs = br->iov; + iovcnt = br->iovcnt; + offset = br->offset + bc->sub_file_start_lba; + } + } + switch (be->op) { case BOP_READ: - io_uring_prep_readv(sqes, bc->fd, br->iov, br->iovcnt, br->offset + bc->sub_file_start_lba); + io_uring_prep_readv(sqes, bc->fd, iovecs, iovcnt, offset); break; case BOP_WRITE: - io_uring_prep_writev(sqes, bc->fd, br->iov, br->iovcnt, br->offset + bc->sub_file_start_lba); + io_uring_prep_writev(sqes, bc->fd, iovecs, iovcnt, offset); break; case BOP_FLUSH: io_uring_prep_fsync(sqes, bc->fd, IORING_FSYNC_DATASYNC); @@ -691,6 +1142,14 @@ iou_process_completions(struct blockif_queue *bq) break; } + /* when a misaligned request is converted to an aligned one, need to do some post-work */ + if (br->align_info.need_conversion) { + if (be->op == BOP_READ) { + blockif_complete_bounced_read(br); + } + blockif_deinit_bounce_iov(br); + } + be->status = BST_DONE; (*br->callback)(br, 0); blockif_complete(bq, be); @@ -1089,6 +1548,7 @@ blockif_open(const char *optstr, const char *ident, int queue_num, struct iothre bc->psectsz = psectsz; bc->psectoff = psectoff; bc->wce = writeback; + bc->bypass_host_cache = bypass_host_cache; bc->aio_mode = aio_mode; if (bc->aio_mode == AIO_MODE_IO_URING) { @@ -1173,6 +1633,22 @@ blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, } bq = bc->bqs + breq->qidx; + blockif_init_alignment_info(bc, breq); + /* For misaligned READ/WRITE, need a bounce_iov to convert the misaligned request to an aligned one. */ + if (((op == BOP_READ) || (op == BOP_WRITE)) && (breq->align_info.need_conversion)) { + err = blockif_init_bounce_iov(breq); + if (err < 0) { + return err; + } + + if (op == BOP_WRITE) { + err = blockif_init_bounced_write(bc, breq); + if (err < 0) { + return err; + } + } + } + if (bc->ops->mutex_lock) { bc->ops->mutex_lock(&bq->mtx); } diff --git a/devicemodel/include/block_if.h b/devicemodel/include/block_if.h index ffa2efe4b..f15b302c4 100644 --- a/devicemodel/include/block_if.h +++ b/devicemodel/include/block_if.h @@ -43,14 +43,59 @@ #define BLOCKIF_IOV_MAX 256 /* not practical to be IOV_MAX */ +/* + * |<------------------------------------- bounced_size --------------------------------->| + * |<-------- alignment ------->| |<-------- alignment ------->| + * |<--- head --->|<------------------------ org_size ---------------------->|<-- tail -->| + * | | | | | | + * *--------------$-------------*----------- ... ------------*---------------$------------* + * | | | | | | + * | start end | + * aligned_dn_start aligned_dn_end + * |__________head_area_________| |__________tail_area_________| + * |<--- head --->| | |<-- end_rmd -->|<-- tail -->| + * |<-------- alignment ------->| |<-------- alignment ------->| + * + */ +struct br_align_info { + uint32_t alignment; + + bool is_iov_base_aligned; + bool is_iov_len_aligned; + bool is_offset_aligned; + + /* + * Needs to convert the misaligned request to an aligned one when + * O_DIRECT is used, but the request (either buffer address/length, or offset) is not aligned. 
+ */ + bool need_conversion; + + uint32_t head; + uint32_t tail; + uint32_t org_size; + uint32_t bounced_size; + + off_t aligned_dn_start; + off_t aligned_dn_end; + + /* + * A bounce_iov for aligned read/write access. + * bounce_iov.iov_base is aligned to @alignment + * bounce_iov.iov_len is @bounced_size (@head + @org_size + @tail) + */ + struct iovec bounce_iov; +}; + struct blockif_req { - struct iovec iov[BLOCKIF_IOV_MAX]; - int iovcnt; - off_t offset; - ssize_t resid; - void (*callback)(struct blockif_req *req, int err); - void *param; - int qidx; + struct iovec iov[BLOCKIF_IOV_MAX]; + int iovcnt; + off_t offset; + ssize_t resid; + void (*callback)(struct blockif_req *req, int err); + void *param; + int qidx; + + struct br_align_info align_info; }; struct blockif_ctxt;
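
Illustrative note (not part of the patch): the head/tail arithmetic documented in the
br_align_info diagram can be exercised on its own. The sketch below is a minimal,
standalone C program that restates that arithmetic (head = start % alignment,
tail derived from end % alignment, bounced_size = head + org_size + tail). The names
compute_align_sketch and struct align_sketch, and the example values (offset 1000,
length 3000, 512-byte alignment), are hypothetical and chosen only for illustration.

/*
 * Standalone illustration of the bounce-buffer geometry described in the
 * br_align_info comment block. Not part of the patch; names are hypothetical.
 *
 * Example with alignment = 512:
 *   start = 1000, org_size = 3000  =>  end = 4000
 *   head = 1000 % 512 = 488,   aligned_dn_start = 512
 *   end_rmd = 4000 % 512 = 416, tail = 512 - 416 = 96, aligned_dn_end = 3584
 *   bounced_size = 488 + 3000 + 96 = 3584, a whole multiple of 512
 */
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>

struct align_sketch {
	uint32_t head;          /* start % alignment */
	uint32_t tail;          /* padding after the original end, 0 if already aligned */
	uint32_t bounced_size;  /* head + org_size + tail */
	off_t aligned_dn_start; /* start rounded down to alignment */
	off_t aligned_dn_end;   /* end rounded down to alignment */
};

static void
compute_align_sketch(off_t start, uint32_t org_size, uint32_t alignment,
		     struct align_sketch *s)
{
	off_t end = start + org_size;
	uint32_t end_rmd = end % alignment;

	/* head area: bytes between the aligned-down start and the original start */
	s->head = start % alignment;
	s->aligned_dn_start = start - s->head;

	/* tail area: bytes between the original end and the next alignment boundary */
	s->tail = (end_rmd == 0) ? 0 : (alignment - end_rmd);
	s->aligned_dn_end = end - end_rmd;

	/* overall bounced area covered by the single bounce_iov */
	s->bounced_size = s->head + org_size + s->tail;
}

int
main(void)
{
	/* example: a request at offset 1000 of length 3000 against 512-byte sectors */
	struct align_sketch s;

	compute_align_sketch(1000, 3000, 512, &s);
	printf("head %u tail %u bounced_size %u aligned_dn_start %jd aligned_dn_end %jd\n",
	       s.head, s.tail, s.bounced_size,
	       (intmax_t)s.aligned_dn_start, (intmax_t)s.aligned_dn_end);
	return 0;
}

With the example values, bounced_size comes out as a whole multiple of the alignment
and the bounce buffer spans [aligned_dn_start, aligned_dn_end + alignment), matching
the "Overall bounced area" rule and the diagram in the patch comments.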