dm: storage: support discard command

Supporting the DISCARD command matters when eMMC usage is high or
when there are many remove operations. For example, while a guest
Android is running, many files are created and removed. However,
because the virtio-blk BE does not support the DISCARD command, data
remove operations in the UOS do not trigger an erase in the eMMC.
After a period of time, the free blocks of the eMMC are used up, and
the eMMC firmware must erase blocks before any new data can be
written. This degrades eMMC performance for the whole system (SOS and
UOS). To solve the problem, DISCARD should be supported in the
virtio-blk BE.

Tracked-On: #2011
Signed-off-by: Conghui Chen <conghui.chen@intel.com>
Reviewed-by: Shuo A Liu <shuo.a.liu@intel.com>
Acked-by: Yu Wang <yu1.wang@intel.com>
This commit is contained in:
Conghui Chen 2018-12-14 09:50:15 +08:00 committed by wenlingz
parent f71370ad81
commit 2ddd24e022
3 changed files with 190 additions and 22 deletions

View File

@ -30,6 +30,7 @@
#include <sys/queue.h> #include <sys/queue.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/ioctl.h> #include <sys/ioctl.h>
#include <linux/falloc.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <errno.h> #include <errno.h>
#include <assert.h> #include <assert.h>
@ -63,6 +64,7 @@
#define BLOCKIF_NUMTHR 8 #define BLOCKIF_NUMTHR 8
#define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR) #define BLOCKIF_MAXREQ (64 + BLOCKIF_NUMTHR)
#define MAX_DISCARD_SEGMENT 256
/* /*
* Debug printf * Debug printf
@ -108,6 +110,9 @@ struct blockif_ctxt {
int sectsz; int sectsz;
int psectsz; int psectsz;
int psectoff; int psectoff;
int max_discard_sectors;
int max_discard_seg;
int discard_sector_alignment;
int closing; int closing;
pthread_t btid[BLOCKIF_NUMTHR]; pthread_t btid[BLOCKIF_NUMTHR];
pthread_mutex_t mtx; pthread_mutex_t mtx;
@ -132,6 +137,12 @@ struct blockif_sig_elem {
struct blockif_sig_elem *next; struct blockif_sig_elem *next;
}; };
/*
 * One discard segment as read from the request payload
 * (br->iov[0].iov_base) by blockif_process_discard().
 * The field order and widths describe the in-memory layout of that
 * payload, so they must not be changed.
 */
struct discard_range {
	/* first sector of the range; multiplied by DEV_BSIZE to get bytes */
	uint64_t sector;
	/* length of the range in sectors; multiplied by DEV_BSIZE to get bytes */
	uint32_t num_sectors;
	/* NOTE(review): not read anywhere in the visible code */
	uint32_t flags;
};
static struct blockif_sig_elem *blockif_bse_head; static struct blockif_sig_elem *blockif_bse_head;
static int static int
@ -232,11 +243,92 @@ blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
TAILQ_INSERT_TAIL(&bc->freeq, be, link); TAILQ_INSERT_TAIL(&bc->freeq, be, link);
} }
/*
 * Check that a single discard range is acceptable for this backend.
 *
 * bc:    block interface context (size, sub-file offset, discard limits)
 * start: byte offset of the range; the caller has already added
 *        bc->sub_file_start_lba to it
 * size:  length of the range in bytes
 *
 * Returns 0 when the range may be discarded, -1 otherwise.
 *
 * A range is rejected when it is empty or negative, starts before the
 * sub-file window, runs past the end of the backing store, is longer
 * than max_discard_sectors, or (when an alignment is configured) does
 * not start on a multiple of discard_sector_alignment sectors.
 */
static int
discard_range_validate(struct blockif_ctxt *bc, off_t start, off_t size)
{
	/* first byte past the region the guest is allowed to touch */
	off_t end = bc->size + bc->sub_file_start_lba;
	off_t start_sector, size_sector;

	/*
	 * start/size are derived from guest supplied values; a wrapped
	 * computation shows up here as a negative number.
	 */
	if (start < 0 || size <= 0)
		return -1;

	if (start < bc->sub_file_start_lba || start > end)
		return -1;

	/* written as a subtraction so that start + size cannot overflow */
	if (size > end - start)
		return -1;

	start_sector = start / DEV_BSIZE;
	size_sector = size / DEV_BSIZE;

	if (size_sector > bc->max_discard_sectors)
		return -1;

	if (bc->discard_sector_alignment &&
	    (start_sector % bc->discard_sector_alignment))
		return -1;

	return 0;
}
/*
 * Execute a discard request against the backing store.
 *
 * bc: block interface context
 * br: request; for virtio-blk the single iov holds an array of
 *     struct discard_range, for ahci the range is in br->offset/br->resid
 *
 * Returns 0 on success (br->resid is cleared), or an errno value:
 *   EOPNOTSUPP - discard is not enabled for this backend
 *   EROFS      - backend is read-only
 *   EINVAL     - too many segments or an invalid range
 *   other      - errno reported by ioctl(BLKDISCARD) / fallocate()
 */
static int
blockif_process_discard(struct blockif_ctxt *bc, struct blockif_req *br)
{
	int err;
	struct discard_range *range;
	size_t n_range;
	int i, segment, max_seg;
	uint64_t max_sector;
	off_t arg[MAX_DISCARD_SEGMENT][2];

	if (!bc->candiscard)
		return EOPNOTSUPP;
	if (bc->rdonly)
		return EROFS;

	if (br->iovcnt == 1) {
		/* virtio-blk use iov to transfer discard range */
		n_range = br->iov[0].iov_len / sizeof(*range);
		range = br->iov[0].iov_base;

		/*
		 * The segment count comes from the guest. Bound it by both
		 * the configured limit and the capacity of arg[] BEFORE
		 * anything is stored, otherwise arg[] can be overrun.
		 */
		max_seg = bc->max_discard_seg;
		if (max_seg > MAX_DISCARD_SEGMENT)
			max_seg = MAX_DISCARD_SEGMENT;
		if (max_seg < 0)
			max_seg = 0;
		if (n_range > (size_t)max_seg) {
			WPRINTF(("segment > max_discard_seg\n"));
			return EINVAL;
		}
		segment = (int)n_range;

		max_sector = (uint64_t)bc->size / DEV_BSIZE;
		for (i = 0; i < segment; i++) {
			/*
			 * A start sector beyond the device can never be
			 * valid; rejecting it here also keeps the byte
			 * offset computation below from wrapping.
			 */
			if (range[i].sector > max_sector) {
				WPRINTF(("discard start sector is beyond the end of the device\n"));
				return EINVAL;
			}
			arg[i][0] = (off_t)(range[i].sector * DEV_BSIZE) +
					bc->sub_file_start_lba;
			/* widen first: a 32-bit multiply would wrap */
			arg[i][1] = (off_t)range[i].num_sectors * DEV_BSIZE;

			if (discard_range_validate(bc, arg[i][0], arg[i][1])) {
				WPRINTF(("range [%ld: %ld] is invalid\n", arg[i][0], arg[i][1]));
				return EINVAL;
			}
		}
	} else {
		/* ahci parse discard range to br->offset and br->resid */
		arg[0][0] = br->offset + bc->sub_file_start_lba;
		arg[0][1] = br->resid;
		segment = 1;
	}

	for (i = 0; i < segment; i++) {
		if (bc->isblk) {
			err = ioctl(bc->fd, BLKDISCARD, arg[i]);
		} else {
			/* FALLOC_FL_PUNCH_HOLE:
			 * Deallocates space in the byte range starting at offset and
			 * continuing for length bytes. After a successful call,
			 * subsequent reads from this range will return zeroes.
			 * FALLOC_FL_KEEP_SIZE:
			 * Do not modify the apparent length of the file.
			 */
			err = fallocate(bc->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
					arg[i][0], arg[i][1]);
		}
		if (err) {
			/*
			 * Both calls return -1 and report the reason in errno.
			 * Capture it before logging can overwrite it, and hand
			 * the real error code back to the caller.
			 */
			err = errno;
			WPRINTF(("Failed to discard offset=%ld nbytes=%ld err code: %d\n",
				arg[i][0], arg[i][1], err));
			return err;
		}
	}

	br->resid = 0;
	return 0;
}
static void static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
{ {
struct blockif_req *br; struct blockif_req *br;
off_t arg[2];
ssize_t len; ssize_t len;
int err; int err;
@ -271,21 +363,7 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
err = errno; err = errno;
break; break;
case BOP_DISCARD: case BOP_DISCARD:
/* only used by AHCI */ err = blockif_process_discard(bc, br);
if (!bc->candiscard)
err = EOPNOTSUPP;
else if (bc->rdonly)
err = EROFS;
else if (bc->isblk) {
arg[0] = br->offset;
arg[1] = br->resid;
if (ioctl(bc->fd, BLKDISCARD, arg))
err = errno;
else
br->resid = 0;
}
else
err = EOPNOTSUPP;
break; break;
default: default:
err = EINVAL; err = EINVAL;
@ -425,6 +503,8 @@ blockif_open(const char *optstr, const char *ident)
int err_code = -1; int err_code = -1;
off_t sub_file_start_lba, sub_file_size; off_t sub_file_start_lba, sub_file_size;
int sub_file_assign; int sub_file_assign;
int max_discard_sectors, max_discard_seg, discard_sector_alignment;
off_t probe_arg[] = {0, 0};
pthread_once(&blockif_once, blockif_init); pthread_once(&blockif_once, blockif_init);
@ -436,9 +516,15 @@ blockif_open(const char *optstr, const char *ident)
sub_file_start_lba = 0; sub_file_start_lba = 0;
sub_file_size = 0; sub_file_size = 0;
max_discard_sectors = -1;
max_discard_seg = -1;
discard_sector_alignment = -1;
/* writethru is on by default */ /* writethru is on by default */
writeback = 0; writeback = 0;
candiscard = 0;
/* /*
* The first element in the optstring is always a pathname. * The first element in the optstring is always a pathname.
* Optional elements follow * Optional elements follow
@ -458,7 +544,18 @@ blockif_open(const char *optstr, const char *ident)
writeback = 0; writeback = 0;
else if (!strcmp(cp, "ro")) else if (!strcmp(cp, "ro"))
ro = 1; ro = 1;
else if (!strncmp(cp, "sectorsize", strlen("sectorsize"))) { else if (!strncmp(cp, "discard", strlen("discard"))) {
strsep(&cp, "=");
if (cp != NULL) {
if (!(!dm_strtoi(cp, &cp, 10, &max_discard_sectors) &&
*cp == ':' &&
!dm_strtoi(cp + 1, &cp, 10, &max_discard_seg) &&
*cp == ':' &&
!dm_strtoi(cp + 1, &cp, 10, &discard_sector_alignment)))
goto err;
}
candiscard = 1;
} else if (!strncmp(cp, "sectorsize", strlen("sectorsize"))) {
/* /*
* sectorsize=<sector size> * sectorsize=<sector size>
* or * or
@ -517,7 +614,6 @@ blockif_open(const char *optstr, const char *ident)
size = sbuf.st_size; size = sbuf.st_size;
sectsz = DEV_BSIZE; sectsz = DEV_BSIZE;
psectsz = psectoff = 0; psectsz = psectoff = 0;
candiscard = 0;
if (S_ISBLK(sbuf.st_mode)) { if (S_ISBLK(sbuf.st_mode)) {
/* get size */ /* get size */
@ -552,8 +648,22 @@ blockif_open(const char *optstr, const char *ident)
DPRINTF(("block partition physical sector size is 0x%lx\n", DPRINTF(("block partition physical sector size is 0x%lx\n",
psectsz)); psectsz));
} else if (candiscard) {
err_code = ioctl(fd, BLKDISCARD, probe_arg);
if (err_code) {
WPRINTF(("not support DISCARD\n"));
candiscard = 0;
}
}
} else {
if (size < DEV_BSIZE || (size & (DEV_BSIZE - 1))) {
WPRINTF(("%s size not corret, should be multiple of %d\n",
nopt, DEV_BSIZE));
return 0;
}
psectsz = sbuf.st_blksize; psectsz = sbuf.st_blksize;
}
if (ssopt != 0) { if (ssopt != 0) {
if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
@ -613,6 +723,15 @@ blockif_open(const char *optstr, const char *ident)
bc->fd = fd; bc->fd = fd;
bc->isblk = S_ISBLK(sbuf.st_mode); bc->isblk = S_ISBLK(sbuf.st_mode);
bc->candiscard = candiscard; bc->candiscard = candiscard;
if (candiscard) {
bc->max_discard_sectors =
(max_discard_sectors != -1) ?
max_discard_sectors : (size / DEV_BSIZE);
bc->max_discard_seg =
(max_discard_seg != -1) ? max_discard_seg : 1;
bc->discard_sector_alignment =
(discard_sector_alignment != -1) ? discard_sector_alignment : 0;
}
bc->rdonly = ro; bc->rdonly = ro;
bc->size = size; bc->size = size;
bc->sectsz = sectsz; bc->sectsz = sectsz;
@ -906,6 +1025,27 @@ blockif_candiscard(struct blockif_ctxt *bc)
return bc->candiscard; return bc->candiscard;
} }
/*
 * Accessor for the max_discard_sectors value stored in the context.
 * Asserts that bc carries the BLOCKIF_SIG magic before reading it.
 */
int
blockif_max_discard_sectors(struct blockif_ctxt *bc)
{
	int max_sectors;

	assert(bc->magic == BLOCKIF_SIG);
	max_sectors = bc->max_discard_sectors;

	return max_sectors;
}
/*
 * Accessor for the max_discard_seg value stored in the context.
 * Asserts that bc carries the BLOCKIF_SIG magic before reading it.
 */
int
blockif_max_discard_seg(struct blockif_ctxt *bc)
{
	int max_segments;

	assert(bc->magic == BLOCKIF_SIG);
	max_segments = bc->max_discard_seg;

	return max_segments;
}
/*
 * Accessor for the discard_sector_alignment value stored in the context.
 * Asserts that bc carries the BLOCKIF_SIG magic before reading it.
 */
int
blockif_discard_sector_alignment(struct blockif_ctxt *bc)
{
	int alignment;

	assert(bc->magic == BLOCKIF_SIG);
	alignment = bc->discard_sector_alignment;

	return alignment;
}
uint8_t uint8_t
blockif_get_wce(struct blockif_ctxt *bc) blockif_get_wce(struct blockif_ctxt *bc)
{ {

View File

@ -58,6 +58,8 @@
/* Device can toggle its cache between writeback and writethrough modes */ /* Device can toggle its cache between writeback and writethrough modes */
#define VIRTIO_BLK_F_CONFIG_WCE (1 << 11) #define VIRTIO_BLK_F_CONFIG_WCE (1 << 11)
#define VIRTIO_BLK_F_DISCARD (1 << 13)
/* /*
* Basic device capabilities * Basic device capabilities
*/ */
@ -94,6 +96,15 @@ struct virtio_blk_config {
uint32_t opt_io_size; uint32_t opt_io_size;
} topology; } topology;
uint8_t writeback; uint8_t writeback;
uint8_t unused;
/* Reserved for num_queues when VIRTIO_BLK_F_MQ is supported */
uint16_t reserve;
/* The maximum discard sectors (in 512-byte sectors) for one segment */
uint32_t max_discard_sectors;
/* The maximum number of discard segments */
uint32_t max_discard_seg;
/* Discard commands must be aligned to this number of sectors. */
uint32_t discard_sector_alignment;
} __attribute__((packed)); } __attribute__((packed));
/* /*
@ -105,6 +116,7 @@ struct virtio_blk_hdr {
#define VBH_OP_FLUSH 4 #define VBH_OP_FLUSH 4
#define VBH_OP_FLUSH_OUT 5 #define VBH_OP_FLUSH_OUT 5
#define VBH_OP_IDENT 8 #define VBH_OP_IDENT 8
#define VBH_OP_DISCARD 11
#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into type */ #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into type */
uint32_t type; uint32_t type;
uint32_t ioprio; uint32_t ioprio;
@ -234,12 +246,13 @@ virtio_blk_proc(struct virtio_blk *blk, struct virtio_vq_info *vq)
* we don't advertise the capability. * we don't advertise the capability.
*/ */
type = vbh->type & ~VBH_FLAG_BARRIER; type = vbh->type & ~VBH_FLAG_BARRIER;
writeop = (type == VBH_OP_WRITE); writeop = ((type == VBH_OP_WRITE) ||
(type == VBH_OP_DISCARD));
iolen = 0; iolen = 0;
for (i = 1; i < n; i++) { for (i = 1; i < n; i++) {
/* /*
* - write op implies read-only descriptor, * - write/discard op implies read-only descriptor,
* - read/ident op implies write-only descriptor, * - read/ident op implies write-only descriptor,
* therefore test the inverse of the descriptor bit * therefore test the inverse of the descriptor bit
* to the op. * to the op.
@ -250,7 +263,7 @@ virtio_blk_proc(struct virtio_blk *blk, struct virtio_vq_info *vq)
io->req.resid = iolen; io->req.resid = iolen;
DPRINTF(("virtio_blk: %s op, %zd bytes, %d segs, offset %ld\n\r", DPRINTF(("virtio_blk: %s op, %zd bytes, %d segs, offset %ld\n\r",
writeop ? "write" : "read/ident", iolen, i - 1, writeop ? "write/discard" : "read/ident", iolen, i - 1,
io->req.offset)); io->req.offset));
switch (type) { switch (type) {
@ -279,6 +292,9 @@ virtio_blk_proc(struct virtio_blk *blk, struct virtio_vq_info *vq)
err = ((type == VBH_OP_READ) ? blockif_read : blockif_write) err = ((type == VBH_OP_READ) ? blockif_read : blockif_write)
(blk->bc, &io->req); (blk->bc, &io->req);
break; break;
case VBH_OP_DISCARD:
err = blockif_discard(blk->bc, &io->req);
break;
case VBH_OP_FLUSH: case VBH_OP_FLUSH:
case VBH_OP_FLUSH_OUT: case VBH_OP_FLUSH_OUT:
err = blockif_flush(blk->bc, &io->req); err = blockif_flush(blk->bc, &io->req);
@ -315,6 +331,10 @@ virtio_blk_get_caps(struct virtio_blk *blk, bool wb)
caps = VIRTIO_BLK_S_HOSTCAPS; caps = VIRTIO_BLK_S_HOSTCAPS;
if (wb) if (wb)
caps |= VIRTIO_BLK_F_WB_BITS; caps |= VIRTIO_BLK_F_WB_BITS;
if (blockif_candiscard(blk->bc))
caps |= VIRTIO_BLK_F_DISCARD;
return caps; return caps;
} }
@ -420,6 +440,11 @@ virtio_blk_init(struct vmctx *ctx, struct pci_vdev *dev, char *opts)
blk->cfg.topology.opt_io_size = 0; blk->cfg.topology.opt_io_size = 0;
blk->cfg.writeback = blockif_get_wce(blk->bc); blk->cfg.writeback = blockif_get_wce(blk->bc);
blk->original_wce = blk->cfg.writeback; /* save for reset */ blk->original_wce = blk->cfg.writeback; /* save for reset */
if (blockif_candiscard(blk->bc)) {
blk->cfg.max_discard_sectors = blockif_max_discard_sectors(blk->bc);
blk->cfg.max_discard_seg = blockif_max_discard_seg(blk->bc);
blk->cfg.discard_sector_alignment = blockif_discard_sector_alignment(blk->bc);
}
blk->base.device_caps = blk->base.device_caps =
virtio_blk_get_caps(blk, !!blk->cfg.writeback); virtio_blk_get_caps(blk, !!blk->cfg.writeback);

View File

@ -69,5 +69,8 @@ int blockif_close(struct blockif_ctxt *bc);
uint8_t blockif_get_wce(struct blockif_ctxt *bc); uint8_t blockif_get_wce(struct blockif_ctxt *bc);
void blockif_set_wce(struct blockif_ctxt *bc, uint8_t wce); void blockif_set_wce(struct blockif_ctxt *bc, uint8_t wce);
int blockif_flush_all(struct blockif_ctxt *bc); int blockif_flush_all(struct blockif_ctxt *bc);
int blockif_max_discard_sectors(struct blockif_ctxt *bc);
int blockif_max_discard_seg(struct blockif_ctxt *bc);
int blockif_discard_sector_alignment(struct blockif_ctxt *bc);
#endif /* _BLOCK_IF_H_ */ #endif /* _BLOCK_IF_H_ */