81 files changed, 1986 insertions, 4426 deletions
diff --git a/Documentation/ABI/testing/debugfs-pktcdvd b/Documentation/ABI/testing/debugfs-pktcdvd deleted file mode 100644 index f6f65a4faea0..000000000000 --- a/Documentation/ABI/testing/debugfs-pktcdvd +++ /dev/null @@ -1,18 +0,0 @@ -What: /sys/kernel/debug/pktcdvd/pktcdvd[0-7] -Date: Oct. 2006 -KernelVersion: 2.6.20 -Contact: Thomas Maier <balagi@justmail.de> -Description: - -The pktcdvd module (packet writing driver) creates -these files in debugfs: - -/sys/kernel/debug/pktcdvd/pktcdvd[0-7]/ - - ==== ====== ==================================== - info 0444 Lots of driver statistics and infos. - ==== ====== ==================================== - -Example:: - - cat /sys/kernel/debug/pktcdvd/pktcdvd0/info diff --git a/Documentation/ABI/testing/sysfs-class-pktcdvd b/Documentation/ABI/testing/sysfs-class-pktcdvd deleted file mode 100644 index ba1ce626591d..000000000000 --- a/Documentation/ABI/testing/sysfs-class-pktcdvd +++ /dev/null @@ -1,97 +0,0 @@ -sysfs interface ---------------- -The pktcdvd module (packet writing driver) creates the following files in the -sysfs: (<devid> is in the format major:minor) - -What: /sys/class/pktcdvd/add -What: /sys/class/pktcdvd/remove -What: /sys/class/pktcdvd/device_map -Date: Oct. 2006 -KernelVersion: 2.6.20 -Contact: Thomas Maier <balagi@justmail.de> -Description: - - ========== ============================================== - add (WO) Write a block device id (major:minor) to - create a new pktcdvd device and map it to the - block device. - - remove (WO) Write the pktcdvd device id (major:minor) - to remove the pktcdvd device. - - device_map (RO) Shows the device mapping in format: - pktcdvd[0-7] <pktdevid> <blkdevid> - ========== ============================================== - - -What: /sys/class/pktcdvd/pktcdvd[0-7]/dev -What: /sys/class/pktcdvd/pktcdvd[0-7]/uevent -Date: Oct. 2006 -KernelVersion: 2.6.20 -Contact: Thomas Maier <balagi@justmail.de> -Description: - dev: (RO) Device id - - uevent: (WO) To send a uevent - - -What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_started -What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_finished -What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_written -What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read -What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read_gather -What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/reset -Date: Oct. 2006 -KernelVersion: 2.6.20 -Contact: Thomas Maier <balagi@justmail.de> -Description: - packets_started: (RO) Number of started packets. - - packets_finished: (RO) Number of finished packets. - - kb_written: (RO) kBytes written. - - kb_read: (RO) kBytes read. - - kb_read_gather: (RO) kBytes read to fill write packets. - - reset: (WO) Write any value to it to reset - pktcdvd device statistic values, like - bytes read/written. - - -What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/size -What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_off -What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_on -Date: Oct. 2006 -KernelVersion: 2.6.20 -Contact: Thomas Maier <balagi@justmail.de> -Description: - ============== ================================================ - size (RO) Contains the size of the bio write queue. - - congestion_off (RW) If bio write queue size is below this mark, - accept new bio requests from the block layer. 
- - congestion_on (RW) If bio write queue size is higher as this - mark, do no longer accept bio write requests - from the block layer and wait till the pktcdvd - device has processed enough bio's so that bio - write queue size is below congestion off mark. - A value of <= 0 disables congestion control. - ============== ================================================ - - -Example: --------- -To use the pktcdvd sysfs interface directly, you can do:: - - # create a new pktcdvd device mapped to /dev/hdc - echo "22:0" >/sys/class/pktcdvd/add - cat /sys/class/pktcdvd/device_map - # assuming device pktcdvd0 was created, look at stat's - cat /sys/class/pktcdvd/pktcdvd0/stat/kb_written - # print the device id of the mapped block device - fgrep pktcdvd0 /sys/class/pktcdvd/device_map - # remove device, using pktcdvd0 device id 253:0 - echo "253:0" >/sys/class/pktcdvd/remove diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst index 6c1303cff159..b97a4e9b9bd3 100644 --- a/Documentation/cdrom/cdrom-standard.rst +++ b/Documentation/cdrom/cdrom-standard.rst @@ -273,7 +273,6 @@ The drive-specific, minor-like information that is registered with __u8 media_written; /* dirty flag, DVD+RW bookkeeping */ unsigned short mmc3_profile; /* current MMC3 profile */ int for_data; /* unknown:TBD */ - int (*exit)(struct cdrom_device_info *);/* unknown:TBD */ int mrw_mode_page; /* which MRW mode page is in use */ }; diff --git a/Documentation/cdrom/index.rst b/Documentation/cdrom/index.rst index e9b022d70939..3ac4f716612f 100644 --- a/Documentation/cdrom/index.rst +++ b/Documentation/cdrom/index.rst @@ -8,7 +8,6 @@ CD-ROM :maxdepth: 1 cdrom-standard - packet-writing .. only:: subproject and html diff --git a/Documentation/cdrom/packet-writing.rst b/Documentation/cdrom/packet-writing.rst deleted file mode 100644 index 43db58c50d29..000000000000 --- a/Documentation/cdrom/packet-writing.rst +++ /dev/null @@ -1,139 +0,0 @@ -============== -Packet writing -============== - -Getting started quick ---------------------- - -- Select packet support in the block device section and UDF support in - the file system section. - -- Compile and install kernel and modules, reboot. - -- You need the udftools package (pktsetup, mkudffs, cdrwtool). - Download from https://github.com/pali/udftools - -- Grab a new CD-RW disc and format it (assuming CD-RW is hdc, substitute - as appropriate):: - - # cdrwtool -d /dev/hdc -q - -- Setup your writer:: - - # pktsetup dev_name /dev/hdc - -- Now you can mount /dev/pktcdvd/dev_name and copy files to it. Enjoy:: - - # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime - - -Packet writing for DVD-RW media -------------------------------- - -DVD-RW discs can be written to much like CD-RW discs if they are in -the so called "restricted overwrite" mode. To put a disc in restricted -overwrite mode, run:: - - # dvd+rw-format /dev/hdc - -You can then use the disc the same way you would use a CD-RW disc:: - - # pktsetup dev_name /dev/hdc - # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime - - -Packet writing for DVD+RW media -------------------------------- - -According to the DVD+RW specification, a drive supporting DVD+RW discs -shall implement "true random writes with 2KB granularity", which means -that it should be possible to put any filesystem with a block size >= -2KB on such a disc. 
For example, it should be possible to do:: - - # dvd+rw-format /dev/hdc (only needed if the disc has never - been formatted) - # mkudffs /dev/hdc - # mount /dev/hdc /cdrom -t udf -o rw,noatime - -However, some drives don't follow the specification and expect the -host to perform aligned writes at 32KB boundaries. Other drives do -follow the specification, but suffer bad performance problems if the -writes are not 32KB aligned. - -Both problems can be solved by using the pktcdvd driver, which always -generates aligned writes:: - - # dvd+rw-format /dev/hdc - # pktsetup dev_name /dev/hdc - # mkudffs /dev/pktcdvd/dev_name - # mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime - - -Packet writing for DVD-RAM media --------------------------------- - -DVD-RAM discs are random writable, so using the pktcdvd driver is not -necessary. However, using the pktcdvd driver can improve performance -in the same way it does for DVD+RW media. - - -Notes ------ - -- CD-RW media can usually not be overwritten more than about 1000 - times, so to avoid unnecessary wear on the media, you should always - use the noatime mount option. - -- Defect management (ie automatic remapping of bad sectors) has not - been implemented yet, so you are likely to get at least some - filesystem corruption if the disc wears out. - -- Since the pktcdvd driver makes the disc appear as a regular block - device with a 2KB block size, you can put any filesystem you like on - the disc. For example, run:: - - # /sbin/mke2fs /dev/pktcdvd/dev_name - - to create an ext2 filesystem on the disc. - - -Using the pktcdvd sysfs interface ---------------------------------- - -Since Linux 2.6.20, the pktcdvd module has a sysfs interface -and can be controlled by it. For example the "pktcdvd" tool uses -this interface. (see http://tom.ist-im-web.de/linux/software/pktcdvd ) - -"pktcdvd" works similar to "pktsetup", e.g.:: - - # pktcdvd -a dev_name /dev/hdc - # mkudffs /dev/pktcdvd/dev_name - # mount -t udf -o rw,noatime /dev/pktcdvd/dev_name /dvdram - # cp files /dvdram - # umount /dvdram - # pktcdvd -r dev_name - - -For a description of the sysfs interface look into the file: - - Documentation/ABI/testing/sysfs-class-pktcdvd - - -Using the pktcdvd debugfs interface ------------------------------------ - -To read pktcdvd device infos in human readable form, do:: - - # cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info - -For a description of the debugfs interface look into the file: - - Documentation/ABI/testing/debugfs-pktcdvd - - - -Links ------ - -See http://fy.chalmers.se/~appro/linux/DVD+RW/ for more information -about DVD writing. diff --git a/Documentation/nvme/nvme-pci-endpoint-target.rst b/Documentation/nvme/nvme-pci-endpoint-target.rst index b699595d1762..69edf44f0d9a 100644 --- a/Documentation/nvme/nvme-pci-endpoint-target.rst +++ b/Documentation/nvme/nvme-pci-endpoint-target.rst @@ -6,21 +6,21 @@ NVMe PCI Endpoint Function Target :Author: Damien Le Moal <dlemoal@kernel.org> -The NVMe PCI endpoint function target driver implements a NVMe PCIe controller -using a NVMe fabrics target controller configured with the PCI transport type. +The NVMe PCI endpoint function target driver implements an NVMe PCIe controller +using an NVMe fabrics target controller configured with the PCI transport type. 
Overview ======== -The NVMe PCI endpoint function target driver allows exposing a NVMe target +The NVMe PCI endpoint function target driver allows exposing an NVMe target controller over a PCIe link, thus implementing an NVMe PCIe device similar to a regular M.2 SSD. The target controller is created in the same manner as when using NVMe over fabrics: the controller represents the interface to an NVMe subsystem using a port. The port transfer type must be configured to be "pci". The subsystem can be configured to have namespaces backed by regular files or block devices, or can use NVMe passthrough to expose to the PCI host an -existing physical NVMe device or a NVMe fabrics host controller (e.g. a NVMe TCP -host controller). +existing physical NVMe device or an NVMe fabrics host controller (e.g. a NVMe +TCP host controller). The NVMe PCI endpoint function target driver relies as much as possible on the NVMe target core code to parse and execute NVMe commands submitted by the PCIe @@ -181,10 +181,10 @@ Creating an NVMe endpoint device is a two step process. First, an NVMe target subsystem and port must be defined. Second, the NVMe PCI endpoint device must be setup and bound to the subsystem and port created. -Creating a NVMe Subsystem and Port ----------------------------------- +Creating an NVMe Subsystem and Port +----------------------------------- -Details about how to configure a NVMe target subsystem and port are outside the +Details about how to configure an NVMe target subsystem and port are outside the scope of this document. The following only provides a simple example of a port and subsystem with a single namespace backed by a null_blk device. @@ -234,8 +234,8 @@ Finally, create the target port and link it to the subsystem:: # ln -s /sys/kernel/config/nvmet/subsystems/nvmepf.0.nqn \ /sys/kernel/config/nvmet/ports/1/subsystems/nvmepf.0.nqn -Creating a NVMe PCI Endpoint Device ------------------------------------ +Creating an NVMe PCI Endpoint Device +------------------------------------ With the NVMe target subsystem and port ready for use, the NVMe PCI endpoint device can now be created and enabled. The NVMe PCI endpoint target driver @@ -303,7 +303,7 @@ device controller:: nvmet_pci_epf nvmet_pci_epf.0: Enabling controller -On the host side, the NVMe PCI endpoint function target device will is +On the host side, the NVMe PCI endpoint function target device is discoverable as a PCI device, with the vendor ID and device ID as configured:: # lspci -n diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index bc91756bde73..4f1532a251d2 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -220,7 +220,6 @@ Code Seq# Include File Comments include/linux/falloc.h, linux/fs.h, 'X' all fs/ocfs2/ocfs_fs.h conflict! -'X' 01 linux/pktcdvd.h conflict! 
'Z' 14-15 drivers/message/fusion/mptctl.h '[' 00-3F linux/usb/tmc.h USB Test and Measurement Devices <mailto:gregkh@linuxfoundation.org> diff --git a/MAINTAINERS b/MAINTAINERS index 6a918dd74974..aca321ed5323 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19706,13 +19706,6 @@ S: Supported F: Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml F: drivers/input/keyboard/pinephone-keyboard.c -PKTCDVD DRIVER -M: linux-block@vger.kernel.org -S: Orphan -F: drivers/block/pktcdvd.c -F: include/linux/pktcdvd.h -F: include/uapi/linux/pktcdvd.h - PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER M: Tomasz Duszynski <tduszyns@gmail.com> S: Maintained diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 10912988c8f5..6b077ca937f6 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -128,6 +128,9 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, if (bip->bip_vcnt > 0) { struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; + if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return 0; + if (bvec_try_merge_hw_page(q, bv, page, len, offset)) { bip->bip_iter.bi_size += len; return len; diff --git a/block/bio.c b/block/bio.c index 3c0a558c90f5..92c512e876c8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -930,8 +930,6 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; - if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) - return false; if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) { if (IS_ENABLED(CONFIG_KMSAN)) @@ -982,6 +980,9 @@ void __bio_add_page(struct bio *bio, struct page *page, WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); + if (is_pci_p2pdma_page(page)) + bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; + bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; @@ -1022,11 +1023,16 @@ int bio_add_page(struct bio *bio, struct page *page, if (bio->bi_iter.bi_size > UINT_MAX - len) return 0; - if (bio->bi_vcnt > 0 && - bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], - page, len, offset)) { - bio->bi_iter.bi_size += len; - return len; + if (bio->bi_vcnt > 0) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return 0; + + if (bvec_try_merge_page(bv, page, len, offset)) { + bio->bi_iter.bi_size += len; + return len; + } } if (bio->bi_vcnt >= bio->bi_max_vecs) diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 444798c5374f..705da074ad6c 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -12,16 +12,56 @@ #include <linux/cpu.h> #include <linux/group_cpus.h> #include <linux/device/bus.h> +#include <linux/sched/isolation.h> #include "blk.h" #include "blk-mq.h" +static unsigned int blk_mq_num_queues(const struct cpumask *mask, + unsigned int max_queues) +{ + unsigned int num; + + num = cpumask_weight(mask); + return min_not_zero(num, max_queues); +} + +/** + * blk_mq_num_possible_queues - Calc nr of queues for multiqueue devices + * @max_queues: The maximum number of queues the hardware/driver + * supports. If max_queues is 0, the argument is + * ignored. + * + * Calculates the number of queues to be used for a multiqueue + * device based on the number of possible CPUs. 
+ */ +unsigned int blk_mq_num_possible_queues(unsigned int max_queues) +{ + return blk_mq_num_queues(cpu_possible_mask, max_queues); +} +EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues); + +/** + * blk_mq_num_online_queues - Calc nr of queues for multiqueue devices + * @max_queues: The maximum number of queues the hardware/driver + * supports. If max_queues is 0, the argument is + * ignored. + * + * Calculates the number of queues to be used for a multiqueue + * device based on the number of online CPUs. + */ +unsigned int blk_mq_num_online_queues(unsigned int max_queues) +{ + return blk_mq_num_queues(cpu_online_mask, max_queues); +} +EXPORT_SYMBOL_GPL(blk_mq_num_online_queues); + void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { const struct cpumask *masks; - unsigned int queue, cpu; + unsigned int queue, cpu, nr_masks; - masks = group_cpus_evenly(qmap->nr_queues); + masks = group_cpus_evenly(qmap->nr_queues, &nr_masks); if (!masks) { for_each_possible_cpu(cpu) qmap->mq_map[cpu] = qmap->queue_offset; @@ -29,7 +69,7 @@ void blk_mq_map_queues(struct blk_mq_queue_map *qmap) } for (queue = 0; queue < qmap->nr_queues; queue++) { - for_each_cpu(cpu, &masks[queue]) + for_each_cpu(cpu, &masks[queue % nr_masks]) qmap->mq_map[cpu] = qmap->queue_offset + queue; } kfree(masks); diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 82bae475dfa4..ad283017caef 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2025 Christoph Hellwig */ +#include <linux/blk-mq-dma.h> #include "blk.h" struct phys_vec { @@ -61,6 +62,166 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, return true; } +/* + * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page + * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so + * we need to ensure our segments are aligned to this as well. + * + * Note that there is no point in using the slightly more complicated IOVA based + * path for single segment mappings. 
+ */ +static inline bool blk_can_dma_map_iova(struct request *req, + struct device *dma_dev) +{ + return !((queue_virt_boundary(req->q) + 1) & + dma_get_merge_boundary(dma_dev)); +} + +static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) +{ + iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr); + iter->len = vec->len; + return true; +} + +static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, + struct blk_dma_iter *iter, struct phys_vec *vec) +{ + iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr), + offset_in_page(vec->paddr), vec->len, rq_dma_dir(req)); + if (dma_mapping_error(dma_dev, iter->addr)) { + iter->status = BLK_STS_RESOURCE; + return false; + } + iter->len = vec->len; + return true; +} + +static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter, + struct phys_vec *vec) +{ + enum dma_data_direction dir = rq_dma_dir(req); + unsigned int mapped = 0; + int error; + + iter->addr = state->addr; + iter->len = dma_iova_size(state); + + do { + error = dma_iova_link(dma_dev, state, vec->paddr, mapped, + vec->len, dir, 0); + if (error) + break; + mapped += vec->len; + } while (blk_map_iter_next(req, &iter->iter, vec)); + + error = dma_iova_sync(dma_dev, state, 0, mapped); + if (error) { + iter->status = errno_to_blk_status(error); + return false; + } + + return true; +} + +/** + * blk_rq_dma_map_iter_start - map the first DMA segment for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the + * caller and don't need to be initialized. @state needs to be stored for use + * at unmap time, @iter is only needed at map time. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true ft it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + * + * The caller can call blk_rq_dma_map_coalesce() to check if further segments + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() + * to try to map the following segments. + */ +bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter) +{ + unsigned int total_len = blk_rq_payload_bytes(req); + struct phys_vec vec; + + iter->iter.bio = req->bio; + iter->iter.iter = req->bio->bi_iter; + memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); + iter->status = BLK_STS_OK; + + /* + * Grab the first segment ASAP because we'll need it to check for P2P + * transfers. + */ + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { + switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, + phys_to_page(vec.paddr))) { + case PCI_P2PDMA_MAP_BUS_ADDR: + return blk_dma_map_bus(iter, &vec); + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * P2P transfers through the host bridge are treated the + * same as non-P2P transfers below and during unmap. 
+ */ + req->cmd_flags &= ~REQ_P2PDMA; + break; + default: + iter->status = BLK_STS_INVAL; + return false; + } + } + + if (blk_can_dma_map_iova(req, dma_dev) && + dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) + return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); + +/** + * blk_rq_dma_map_iter_next - map the next DMA segment for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Iterate to the next mapping after a previous call to + * blk_rq_dma_map_iter_start(). See there for a detailed description of the + * arguments. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true ft it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + */ +bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter) +{ + struct phys_vec vec; + + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + return blk_dma_map_bus(iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next); + static inline struct scatterlist * blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 4806b867e37d..9692fa4c3ef2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -883,7 +883,8 @@ static void blk_complete_request(struct request *req) /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - blk_zone_update_request_bio(req, bio); + if (blk_req_bio_is_zone_append(req, bio)) + blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); @@ -982,7 +983,8 @@ bool blk_update_request(struct request *req, blk_status_t error, /* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { - blk_zone_update_request_bio(req, bio); + if (blk_req_bio_is_zone_append(req, bio)) + blk_zone_append_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); } @@ -3169,8 +3171,10 @@ void blk_mq_submit_bio(struct bio *bio) if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; - if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) - goto queue_exit; + if (bio_needs_zone_write_plugging(bio)) { + if (blk_zone_plug_bio(bio, nr_segs)) + goto queue_exit; + } new_request: if (rq) { @@ -4966,6 +4970,60 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } +/* + * Switch back to the elevator type stored in the xarray. + */ +static void blk_mq_elv_switch_back(struct request_queue *q, + struct xarray *elv_tbl) +{ + struct elevator_type *e = xa_load(elv_tbl, q->id); + + /* The elv_update_nr_hw_queues unfreezes the queue. */ + elv_update_nr_hw_queues(q, e); + + /* Drop the reference acquired in blk_mq_elv_switch_none. */ + if (e) + elevator_put(e); +} + +/* + * Stores elevator type in xarray and set current elevator to none. It uses + * q->id as an index to store the elevator type into the xarray. 
+ */ +static int blk_mq_elv_switch_none(struct request_queue *q, + struct xarray *elv_tbl) +{ + int ret = 0; + + lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); + + /* + * Accessing q->elevator without holding q->elevator_lock is safe here + * because we're called from nr_hw_queue update which is protected by + * set->update_nr_hwq_lock in the writer context. So, scheduler update/ + * switch code (which acquires the same lock in the reader context) + * can't run concurrently. + */ + if (q->elevator) { + + ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL); + if (WARN_ON_ONCE(ret)) + return ret; + + /* + * Before we switch elevator to 'none', take a reference to + * the elevator module so that while nr_hw_queue update is + * running, no one can remove elevator module. We'd put the + * reference to elevator module later when we switch back + * elevator. + */ + __elevator_get(q->elevator->type); + + elevator_set_none(q); + } + return ret; +} + static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { @@ -4973,6 +5031,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; + struct xarray elv_tbl; lockdep_assert_held(&set->tag_list_lock); @@ -4984,6 +5043,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, return; memflags = memalloc_noio_save(); + + xa_init(&elv_tbl); + list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister_hctxs(q); @@ -4992,11 +5054,17 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue_nomemsave(q); - if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) { - list_for_each_entry(q, &set->tag_list, tag_set_list) - blk_mq_unfreeze_queue_nomemrestore(q); - goto reregister; - } + /* + * Switch IO scheduler to 'none', cleaning up the data associated + * with the previous scheduler. We will switch back once we are done + * updating the new sw to hw queue mappings. + */ + list_for_each_entry(q, &set->tag_list, tag_set_list) + if (blk_mq_elv_switch_none(q, &elv_tbl)) + goto switch_back; + + if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) + goto switch_back; fallback: blk_mq_update_queue_map(set); @@ -5016,12 +5084,11 @@ fallback: } blk_mq_map_swqueue(q); } - - /* elv_update_nr_hw_queues() unfreeze queue for us */ +switch_back: + /* The blk_mq_elv_switch_back unfreezes queue for us. */ list_for_each_entry(q, &set->tag_list, tag_set_list) - elv_update_nr_hw_queues(q); + blk_mq_elv_switch_back(q, &elv_tbl); -reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); @@ -5029,6 +5096,9 @@ reregister: blk_mq_remove_hw_queues_cpuhp(q); blk_mq_add_hw_queues_cpuhp(q); } + + xa_destroy(&elv_tbl); + memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. 
*/ diff --git a/block/blk-settings.c b/block/blk-settings.c index e0cc7a1fe4ac..91449147bae9 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -221,6 +221,8 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim) static void blk_validate_atomic_write_limits(struct queue_limits *lim) { unsigned int boundary_sectors; + unsigned int atomic_write_hw_max_sectors = + lim->atomic_write_hw_max >> SECTOR_SHIFT; if (!(lim->features & BLK_FEAT_ATOMIC_WRITES)) goto unsupported; @@ -242,6 +244,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) lim->atomic_write_hw_max)) goto unsupported; + if (WARN_ON_ONCE(lim->chunk_sectors && + atomic_write_hw_max_sectors > lim->chunk_sectors)) + goto unsupported; + boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; if (boundary_sectors) { @@ -636,41 +642,50 @@ static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, return true; } - -/* Check stacking of first bottom device */ -static bool blk_stack_atomic_writes_head(struct queue_limits *t, - struct queue_limits *b) +static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) { - if (b->atomic_write_hw_boundary && - !blk_stack_atomic_writes_boundary_head(t, b)) - return false; + unsigned int chunk_bytes; - if (t->io_min <= SECTOR_SIZE) { - /* No chunk sectors, so use bottom device values directly */ - t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; - t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; - t->atomic_write_hw_max = b->atomic_write_hw_max; - return true; - } + if (!t->chunk_sectors) + return; + + /* + * If chunk sectors is so large that its value in bytes overflows + * UINT_MAX, then just shift it down so it definitely will fit. + * We don't support atomic writes of such a large size anyway. + */ + if (check_shl_overflow(t->chunk_sectors, SECTOR_SHIFT, &chunk_bytes)) + chunk_bytes = t->chunk_sectors; /* * Find values for limits which work for chunk size. * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk - * size (t->io_min), as chunk size is not restricted to a power-of-2. + * size, as the chunk size is not restricted to a power-of-2. * So we need to find highest power-of-2 which works for the chunk * size. - * As an example scenario, we could have b->unit_max = 16K and - * t->io_min = 24K. For this case, reduce t->unit_max to a value - * aligned with both limits, i.e. 8K in this example. + * As an example scenario, we could have t->unit_max = 16K and + * t->chunk_sectors = 24KB. For this case, reduce t->unit_max to a + * value aligned with both limits, i.e. 8K in this example. 
*/ - t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; - while (t->io_min % t->atomic_write_hw_unit_max) - t->atomic_write_hw_unit_max /= 2; + t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, + max_pow_of_two_factor(chunk_bytes)); - t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min, + t->atomic_write_hw_unit_min = min(t->atomic_write_hw_unit_min, t->atomic_write_hw_unit_max); - t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min); + t->atomic_write_hw_max = min(t->atomic_write_hw_max, chunk_bytes); +} +/* Check stacking of first bottom device */ +static bool blk_stack_atomic_writes_head(struct queue_limits *t, + struct queue_limits *b) +{ + if (b->atomic_write_hw_boundary && + !blk_stack_atomic_writes_boundary_head(t, b)) + return false; + + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; + t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; + t->atomic_write_hw_max = b->atomic_write_hw_max; return true; } @@ -698,6 +713,7 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t, if (!blk_stack_atomic_writes_head(t, b)) goto unsupported; + blk_stack_atomic_writes_chunk_sectors(t); return; unsupported: diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 351d659280e1..ef43aaca49f4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -17,6 +17,8 @@ #include <linux/refcount.h> #include <linux/mempool.h> +#include <trace/events/block.h> + #include "blk.h" #include "blk-mq-sched.h" #include "blk-mq-debugfs.h" @@ -177,6 +179,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev) struct bio bio; bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); + trace_blkdev_zone_mgmt(&bio, 0); return submit_bio_wait(&bio); } @@ -240,6 +243,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, cond_resched(); } + trace_blkdev_zone_mgmt(bio, nr_sectors); ret = submit_bio_wait(bio); bio_put(bio); @@ -818,6 +822,8 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, * at the tail of the list to preserve the sequential write order. */ bio_list_add(&zwplug->bio_list, bio); + trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, + bio->bi_iter.bi_sector, bio_sectors(bio)); zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; @@ -1116,25 +1122,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { struct block_device *bdev = bio->bi_bdev; - if (!bdev->bd_disk->zone_wplugs_hash) - return false; - - /* - * If the BIO already has the plugging flag set, then it was already - * handled through this path and this is a submission from the zone - * plug bio submit work. - */ - if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) - return false; - - /* - * We do not need to do anything special for empty flush BIOs, e.g - * BIOs such as issued by blkdev_issue_flush(). The is because it is - * the responsibility of the user to first wait for the completion of - * write operations for flush to have any effect on the persistence of - * the written data. - */ - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash)) return false; /* @@ -1205,6 +1193,20 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, spin_unlock_irqrestore(&zwplug->lock, flags); } +void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio) +{ + /* + * For zone append requests, the request sector indicates the location + * at which the BIO data was written. Return this value to the BIO + * issuer through the BIO iter sector. 
+ * For plugged zone writes, which include emulated zone append, we need + * the original BIO sector so that blk_zone_write_plug_bio_endio() can + * lookup the zone write plug. + */ + bio->bi_iter.bi_sector = rq->__sector; + trace_blk_zone_append_update_request_bio(rq); +} + void blk_zone_write_plug_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; @@ -1299,6 +1301,9 @@ again: goto put_zwplug; } + trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, + bio->bi_iter.bi_sector, bio_sectors(bio)); + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { blk_zone_wplug_bio_io_error(zwplug, bio); goto again; diff --git a/block/blk.h b/block/blk.h index 37ec459fe656..76901a39997f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -13,6 +13,15 @@ struct elevator_type; +/* + * Default upper limit for the software max_sectors limit used for regular I/Os. + * This can be increased through sysfs. + * + * This should not be confused with the max_hw_sector limit that is entirely + * controlled by the block device driver, usually based on hardware limits. + */ +#define BLK_DEF_MAX_SECTORS_CAP (SZ_4M >> SECTOR_SHIFT) + #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) #define BLK_MIN_SEGMENT_SIZE 4096 @@ -321,7 +330,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_insert_flush(struct request *rq); -void elv_update_nr_hw_queues(struct request_queue *q); +void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e); void elevator_set_default(struct request_queue *q); void elevator_set_none(struct request_queue *q); @@ -467,23 +476,15 @@ static inline bool bio_zone_write_plugging(struct bio *bio) { return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); } -void blk_zone_write_plug_bio_merged(struct bio *bio); -void blk_zone_write_plug_init_request(struct request *rq); -static inline void blk_zone_update_request_bio(struct request *rq, - struct bio *bio) +static inline bool blk_req_bio_is_zone_append(struct request *rq, + struct bio *bio) { - /* - * For zone append requests, the request sector indicates the location - * at which the BIO data was written. Return this value to the BIO - * issuer through the BIO iter sector. - * For plugged zone writes, which include emulated zone append, we need - * the original BIO sector so that blk_zone_write_plug_bio_endio() can - * lookup the zone write plug. 
- */ - if (req_op(rq) == REQ_OP_ZONE_APPEND || - bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) - bio->bi_iter.bi_sector = rq->__sector; + return req_op(rq) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); } +void blk_zone_write_plug_bio_merged(struct bio *bio); +void blk_zone_write_plug_init_request(struct request *rq); +void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio); void blk_zone_write_plug_bio_endio(struct bio *bio); static inline void blk_zone_bio_endio(struct bio *bio) { @@ -516,14 +517,19 @@ static inline bool bio_zone_write_plugging(struct bio *bio) { return false; } +static inline bool blk_req_bio_is_zone_append(struct request *req, + struct bio *bio) +{ + return false; +} static inline void blk_zone_write_plug_bio_merged(struct bio *bio) { } static inline void blk_zone_write_plug_init_request(struct request *rq) { } -static inline void blk_zone_update_request_bio(struct request *rq, - struct bio *bio) +static inline void blk_zone_append_update_request_bio(struct request *rq, + struct bio *bio) { } static inline void blk_zone_bio_endio(struct bio *bio) diff --git a/block/elevator.c b/block/elevator.c index a960bdc869bc..88f8f36bed98 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -689,21 +689,21 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) * The I/O scheduler depends on the number of hardware queues, this forces a * reattachment when nr_hw_queues changes. */ -void elv_update_nr_hw_queues(struct request_queue *q) +void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e) { struct elv_change_ctx ctx = {}; int ret = -ENODEV; WARN_ON_ONCE(q->mq_freeze_depth == 0); - mutex_lock(&q->elevator_lock); - if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) { - ctx.name = q->elevator->type->elevator_name; + if (e && !blk_queue_dying(q) && blk_queue_registered(q)) { + ctx.name = e->elevator_name; + mutex_lock(&q->elevator_lock); /* force to reattach elevator after nr_hw_queue is updated */ ret = elevator_switch(q, &ctx); + mutex_unlock(&q->elevator_lock); } - mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue_nomemrestore(q); if (!ret) WARN_ON_ONCE(elevator_change_done(q, &ctx)); diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 0f70e2374e7f..df38fb364904 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -256,49 +256,6 @@ config BLK_DEV_RAM_SIZE The default value is 4096 kilobytes. Only change this if you know what you are doing. -config CDROM_PKTCDVD - tristate "Packet writing on CD/DVD media (DEPRECATED)" - depends on !UML - depends on SCSI - select CDROM - help - Note: This driver is deprecated and will be removed from the - kernel in the near future! - - If you have a CDROM/DVD drive that supports packet writing, say - Y to include support. It should work with any MMC/Mt Fuji - compliant ATAPI or SCSI drive, which is just about any newer - DVD/CD writer. - - Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs - is possible. - DVD-RW disks must be in restricted overwrite mode. - - See the file <file:Documentation/cdrom/packet-writing.rst> - for further information on the use of this driver. - - To compile this driver as a module, choose M here: the - module will be called pktcdvd. - -config CDROM_PKTCDVD_BUFFERS - int "Free buffers for data gathering" - depends on CDROM_PKTCDVD - default "8" - help - This controls the maximum number of active concurrent packets. 
More - concurrent packets can increase write performance, but also require - more memory. Each concurrent packet will require approximately 64Kb - of non-swappable kernel memory, memory which will be allocated when - a disc is opened for writing. - -config CDROM_PKTCDVD_WCACHE - bool "Enable write caching" - depends on CDROM_PKTCDVD - help - If enabled, write caching will be set for the CD-R/W device. For now - this option is dangerous unless the CD-RW media is known good, as we - don't do deferred write error handling yet. - config ATA_OVER_ETH tristate "ATA over Ethernet support" depends on NET diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 097707aca725..a695ce74ef22 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_N64CART) += n64cart.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o -obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index e5a2e5f7887b..975024cf03c5 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2500,7 +2500,11 @@ static int handle_write_conflicts(struct drbd_device *device, peer_req->w.cb = superseded ? e_send_superseded : e_send_retry_write; list_add_tail(&peer_req->w.list, &device->done_ee); - queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work); + /* put is in drbd_send_acks_wf() */ + kref_get(&device->kref); + if (!queue_work(connection->ack_sender, + &peer_req->peer_device->send_acks_work)) + kref_put(&device->kref, drbd_destroy_device); err = -ENOENT; goto out; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index e97432032f01..24be0c2c4075 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3411,7 +3411,7 @@ static int fd_locked_ioctl(struct block_device *bdev, blk_mode_t mode, struct floppy_max_errors max_errors; struct floppy_drive_params dp; } inparam; /* parameters coming from user space */ - const void *outparam; /* parameters passed back to user space */ + const void *outparam = NULL; /* parameters passed back to user space */ /* convert compatibility eject ioctls into floppy eject ioctl. * We do this in order to provide a means to eject floppy disks before diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 8d994cae3b83..1b6ee91f8eb9 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1431,17 +1431,34 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) return 0; } -static int loop_set_block_size(struct loop_device *lo, unsigned long arg) +static int loop_set_block_size(struct loop_device *lo, blk_mode_t mode, + struct block_device *bdev, unsigned long arg) { struct queue_limits lim; unsigned int memflags; int err = 0; - if (lo->lo_state != Lo_bound) - return -ENXIO; + /* + * If we don't hold exclusive handle for the device, upgrade to it + * here to avoid changing device under exclusive owner. 
+ */ + if (!(mode & BLK_OPEN_EXCL)) { + err = bd_prepare_to_claim(bdev, loop_set_block_size, NULL); + if (err) + return err; + } + + err = mutex_lock_killable(&lo->lo_mutex); + if (err) + goto abort_claim; + + if (lo->lo_state != Lo_bound) { + err = -ENXIO; + goto unlock; + } if (lo->lo_queue->limits.logical_block_size == arg) - return 0; + goto unlock; sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); @@ -1454,6 +1471,11 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue, memflags); +unlock: + mutex_unlock(&lo->lo_mutex); +abort_claim: + if (!(mode & BLK_OPEN_EXCL)) + bd_abort_claiming(bdev, loop_set_block_size); return err; } @@ -1472,9 +1494,6 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, case LOOP_SET_DIRECT_IO: err = loop_set_dio(lo, arg); break; - case LOOP_SET_BLOCK_SIZE: - err = loop_set_block_size(lo, arg); - break; default: err = -EINVAL; } @@ -1529,9 +1548,12 @@ static int lo_ioctl(struct block_device *bdev, blk_mode_t mode, break; case LOOP_GET_STATUS64: return loop_get_status64(lo, argp); + case LOOP_SET_BLOCK_SIZE: + if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + return loop_set_block_size(lo, mode, bdev, arg); case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: - case LOOP_SET_BLOCK_SIZE: if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; fallthrough; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 66ce6b81c7d9..8fc7761397bd 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2040,11 +2040,12 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, * @dir Direction (read or write) * * return value - * None + * 0 The IO completed successfully. + * -ENOMEM The DMA mapping failed. 
*/ -static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, - struct mtip_cmd *command, - struct blk_mq_hw_ctx *hctx) +static int mtip_hw_submit_io(struct driver_data *dd, struct request *rq, + struct mtip_cmd *command, + struct blk_mq_hw_ctx *hctx) { struct mtip_cmd_hdr *hdr = dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag; @@ -2056,12 +2057,14 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, unsigned int nents; /* Map the scatter list for DMA access */ - nents = blk_rq_map_sg(rq, command->sg); - nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); + command->scatter_ents = blk_rq_map_sg(rq, command->sg); + nents = dma_map_sg(&dd->pdev->dev, command->sg, + command->scatter_ents, dma_dir); + if (!nents) + return -ENOMEM; - prefetch(&port->flags); - command->scatter_ents = nents; + prefetch(&port->flags); /* * The number of retries for this command before it is @@ -2112,11 +2115,13 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, if (unlikely(port->flags & MTIP_PF_PAUSE_IO)) { set_bit(rq->tag, port->cmds_to_issue); set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); - return; + return 0; } /* Issue the command to the hardware */ mtip_issue_ncq_command(port, rq->tag); + + return 0; } /* @@ -3315,7 +3320,9 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - mtip_hw_submit_io(dd, rq, cmd, hctx); + if (mtip_hw_submit_io(dd, rq, cmd, hctx)) + return BLK_STS_IOERR; + return BLK_STS_OK; } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 2592bd19ebc1..6463d0e8d0ce 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1473,7 +1473,17 @@ static int nbd_start_device(struct nbd_device *nbd) return -EINVAL; } - blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections); +retry: + mutex_unlock(&nbd->config_lock); + blk_mq_update_nr_hw_queues(&nbd->tag_set, num_connections); + mutex_lock(&nbd->config_lock); + + /* if another code path updated nr_hw_queues, retry until succeed */ + if (num_connections != config->num_connections) { + num_connections = config->num_connections; + goto retry; + } + nbd->pid = task_pid_nr(current); nbd_parse_flags(nbd); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c deleted file mode 100644 index d5cc7bd2875c..000000000000 --- a/drivers/block/pktcdvd.c +++ /dev/null @@ -1,2916 +0,0 @@ -/* - * Copyright (C) 2000 Jens Axboe <axboe@suse.de> - * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com> - * Copyright (C) 2006 Thomas Maier <balagi@justmail.de> - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * Packet writing layer for ATAPI and SCSI CD-RW, DVD+RW, DVD-RW and - * DVD-RAM devices. - * - * Theory of operation: - * - * At the lowest level, there is the standard driver for the CD/DVD device, - * such as drivers/scsi/sr.c. This driver can handle read and write requests, - * but it doesn't know anything about the special restrictions that apply to - * packet writing. One restriction is that write requests must be aligned to - * packet boundaries on the physical media, and the size of a write request - * must be equal to the packet size. Another restriction is that a - * GPCMD_FLUSH_CACHE command has to be issued to the drive before a read - * command, if the previous command was a write. 
- * - * The purpose of the packet writing driver is to hide these restrictions from - * higher layers, such as file systems, and present a block device that can be - * randomly read and written using 2kB-sized blocks. - * - * The lowest layer in the packet writing driver is the packet I/O scheduler. - * Its data is defined by the struct packet_iosched and includes two bio - * queues with pending read and write requests. These queues are processed - * by the pkt_iosched_process_queue() function. The write requests in this - * queue are already properly aligned and sized. This layer is responsible for - * issuing the flush cache commands and scheduling the I/O in a good order. - * - * The next layer transforms unaligned write requests to aligned writes. This - * transformation requires reading missing pieces of data from the underlying - * block device, assembling the pieces to full packets and queuing them to the - * packet I/O scheduler. - * - * At the top layer there is a custom ->submit_bio function that forwards - * read requests directly to the iosched queue and puts write requests in the - * unaligned write queue. A kernel thread performs the necessary read - * gathering to convert the unaligned writes to aligned writes and then feeds - * them to the packet I/O scheduler. - * - *************************************************************************/ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/backing-dev.h> -#include <linux/compat.h> -#include <linux/debugfs.h> -#include <linux/device.h> -#include <linux/errno.h> -#include <linux/file.h> -#include <linux/freezer.h> -#include <linux/kernel.h> -#include <linux/kthread.h> -#include <linux/miscdevice.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/nospec.h> -#include <linux/pktcdvd.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/types.h> -#include <linux/uaccess.h> - -#include <scsi/scsi.h> -#include <scsi/scsi_cmnd.h> -#include <scsi/scsi_ioctl.h> - -#include <linux/unaligned.h> - -#define DRIVER_NAME "pktcdvd" - -#define MAX_SPEED 0xffff - -static DEFINE_MUTEX(pktcdvd_mutex); -static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; -static struct proc_dir_entry *pkt_proc; -static int pktdev_major; -static int write_congestion_on = PKT_WRITE_CONGESTION_ON; -static int write_congestion_off = PKT_WRITE_CONGESTION_OFF; -static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */ -static mempool_t psd_pool; -static struct bio_set pkt_bio_set; - -/* /sys/class/pktcdvd */ -static struct class class_pktcdvd; -static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */ - -/* forward declaration */ -static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev); -static int pkt_remove_dev(dev_t pkt_dev); - -static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) -{ - return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1); -} - -/********************************************************** - * sysfs interface for pktcdvd - * by (C) 2006 Thomas Maier <balagi@justmail.de> - - /sys/class/pktcdvd/pktcdvd[0-7]/ - stat/reset - stat/packets_started - stat/packets_finished - stat/kb_written - stat/kb_read - stat/kb_read_gather - write_queue/size - write_queue/congestion_off - write_queue/congestion_on - **********************************************************/ - -static ssize_t packets_started_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct 
pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.pkt_started); -} -static DEVICE_ATTR_RO(packets_started); - -static ssize_t packets_finished_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.pkt_ended); -} -static DEVICE_ATTR_RO(packets_finished); - -static ssize_t kb_written_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.secs_w >> 1); -} -static DEVICE_ATTR_RO(kb_written); - -static ssize_t kb_read_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.secs_r >> 1); -} -static DEVICE_ATTR_RO(kb_read); - -static ssize_t kb_read_gather_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%lu\n", pd->stats.secs_rg >> 1); -} -static DEVICE_ATTR_RO(kb_read_gather); - -static ssize_t reset_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t len) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - - if (len > 0) { - pd->stats.pkt_started = 0; - pd->stats.pkt_ended = 0; - pd->stats.secs_w = 0; - pd->stats.secs_rg = 0; - pd->stats.secs_r = 0; - } - return len; -} -static DEVICE_ATTR_WO(reset); - -static struct attribute *pkt_stat_attrs[] = { - &dev_attr_packets_finished.attr, - &dev_attr_packets_started.attr, - &dev_attr_kb_read.attr, - &dev_attr_kb_written.attr, - &dev_attr_kb_read_gather.attr, - &dev_attr_reset.attr, - NULL, -}; - -static const struct attribute_group pkt_stat_group = { - .name = "stat", - .attrs = pkt_stat_attrs, -}; - -static ssize_t size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int n; - - spin_lock(&pd->lock); - n = sysfs_emit(buf, "%d\n", pd->bio_queue_size); - spin_unlock(&pd->lock); - return n; -} -static DEVICE_ATTR_RO(size); - -static void init_write_congestion_marks(int* lo, int* hi) -{ - if (*hi > 0) { - *hi = max(*hi, 500); - *hi = min(*hi, 1000000); - if (*lo <= 0) - *lo = *hi - 100; - else { - *lo = min(*lo, *hi - 100); - *lo = max(*lo, 100); - } - } else { - *hi = -1; - *lo = -1; - } -} - -static ssize_t congestion_off_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int n; - - spin_lock(&pd->lock); - n = sysfs_emit(buf, "%d\n", pd->write_congestion_off); - spin_unlock(&pd->lock); - return n; -} - -static ssize_t congestion_off_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int val, ret; - - ret = kstrtoint(buf, 10, &val); - if (ret) - return ret; - - spin_lock(&pd->lock); - pd->write_congestion_off = val; - init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on); - spin_unlock(&pd->lock); - return len; -} -static DEVICE_ATTR_RW(congestion_off); - -static ssize_t congestion_on_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int n; - - spin_lock(&pd->lock); - n = sysfs_emit(buf, "%d\n", pd->write_congestion_on); - spin_unlock(&pd->lock); - return n; -} - -static ssize_t 
congestion_on_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) -{ - struct pktcdvd_device *pd = dev_get_drvdata(dev); - int val, ret; - - ret = kstrtoint(buf, 10, &val); - if (ret) - return ret; - - spin_lock(&pd->lock); - pd->write_congestion_on = val; - init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on); - spin_unlock(&pd->lock); - return len; -} -static DEVICE_ATTR_RW(congestion_on); - -static struct attribute *pkt_wq_attrs[] = { - &dev_attr_congestion_on.attr, - &dev_attr_congestion_off.attr, - &dev_attr_size.attr, - NULL, -}; - -static const struct attribute_group pkt_wq_group = { - .name = "write_queue", - .attrs = pkt_wq_attrs, -}; - -static const struct attribute_group *pkt_groups[] = { - &pkt_stat_group, - &pkt_wq_group, - NULL, -}; - -static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) -{ - if (class_is_registered(&class_pktcdvd)) { - pd->dev = device_create_with_groups(&class_pktcdvd, NULL, - MKDEV(0, 0), pd, pkt_groups, - "%s", pd->disk->disk_name); - if (IS_ERR(pd->dev)) - pd->dev = NULL; - } -} - -static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd) -{ - if (class_is_registered(&class_pktcdvd)) - device_unregister(pd->dev); -} - - -/******************************************************************** - /sys/class/pktcdvd/ - add map block device - remove unmap packet dev - device_map show mappings - *******************************************************************/ - -static ssize_t device_map_show(const struct class *c, const struct class_attribute *attr, - char *data) -{ - int n = 0; - int idx; - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - for (idx = 0; idx < MAX_WRITERS; idx++) { - struct pktcdvd_device *pd = pkt_devs[idx]; - if (!pd) - continue; - n += sysfs_emit_at(data, n, "%s %u:%u %u:%u\n", - pd->disk->disk_name, - MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev), - MAJOR(file_bdev(pd->bdev_file)->bd_dev), - MINOR(file_bdev(pd->bdev_file)->bd_dev)); - } - mutex_unlock(&ctl_mutex); - return n; -} -static CLASS_ATTR_RO(device_map); - -static ssize_t add_store(const struct class *c, const struct class_attribute *attr, - const char *buf, size_t count) -{ - unsigned int major, minor; - - if (sscanf(buf, "%u:%u", &major, &minor) == 2) { - /* pkt_setup_dev() expects caller to hold reference to self */ - if (!try_module_get(THIS_MODULE)) - return -ENODEV; - - pkt_setup_dev(MKDEV(major, minor), NULL); - - module_put(THIS_MODULE); - - return count; - } - - return -EINVAL; -} -static CLASS_ATTR_WO(add); - -static ssize_t remove_store(const struct class *c, const struct class_attribute *attr, - const char *buf, size_t count) -{ - unsigned int major, minor; - if (sscanf(buf, "%u:%u", &major, &minor) == 2) { - pkt_remove_dev(MKDEV(major, minor)); - return count; - } - return -EINVAL; -} -static CLASS_ATTR_WO(remove); - -static struct attribute *class_pktcdvd_attrs[] = { - &class_attr_add.attr, - &class_attr_remove.attr, - &class_attr_device_map.attr, - NULL, -}; -ATTRIBUTE_GROUPS(class_pktcdvd); - -static struct class class_pktcdvd = { - .name = DRIVER_NAME, - .class_groups = class_pktcdvd_groups, -}; - -static int pkt_sysfs_init(void) -{ - /* - * create control files in sysfs - * /sys/class/pktcdvd/... 
- */ - return class_register(&class_pktcdvd); -} - -static void pkt_sysfs_cleanup(void) -{ - class_unregister(&class_pktcdvd); -} - -/******************************************************************** - entries in debugfs - - /sys/kernel/debug/pktcdvd[0-7]/ - info - - *******************************************************************/ - -static void pkt_count_states(struct pktcdvd_device *pd, int *states) -{ - struct packet_data *pkt; - int i; - - for (i = 0; i < PACKET_NUM_STATES; i++) - states[i] = 0; - - spin_lock(&pd->cdrw.active_list_lock); - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - states[pkt->state]++; - } - spin_unlock(&pd->cdrw.active_list_lock); -} - -static int pkt_seq_show(struct seq_file *m, void *p) -{ - struct pktcdvd_device *pd = m->private; - char *msg; - int states[PACKET_NUM_STATES]; - - seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name, - file_bdev(pd->bdev_file)); - - seq_printf(m, "\nSettings:\n"); - seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); - - if (pd->settings.write_type == 0) - msg = "Packet"; - else - msg = "Unknown"; - seq_printf(m, "\twrite type:\t\t%s\n", msg); - - seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable"); - seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss); - - seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode); - - if (pd->settings.block_mode == PACKET_BLOCK_MODE1) - msg = "Mode 1"; - else if (pd->settings.block_mode == PACKET_BLOCK_MODE2) - msg = "Mode 2"; - else - msg = "Unknown"; - seq_printf(m, "\tblock mode:\t\t%s\n", msg); - - seq_printf(m, "\nStatistics:\n"); - seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started); - seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended); - seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1); - seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1); - seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1); - - seq_printf(m, "\nMisc:\n"); - seq_printf(m, "\treference count:\t%d\n", pd->refcnt); - seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags); - seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed); - seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed); - seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset); - seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset); - - seq_printf(m, "\nQueue state:\n"); - seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size); - seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios)); - seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", pd->current_sector); - - pkt_count_states(pd, states); - seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], states[3], states[4], states[5]); - - seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n", - pd->write_congestion_off, - pd->write_congestion_on); - return 0; -} -DEFINE_SHOW_ATTRIBUTE(pkt_seq); - -static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) -{ - if (!pkt_debugfs_root) - return; - pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root); - - pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root, - pd, &pkt_seq_fops); -} - -static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) -{ - if (!pkt_debugfs_root) - return; - debugfs_remove(pd->dfs_f_info); - debugfs_remove(pd->dfs_d_root); - pd->dfs_f_info = NULL; - pd->dfs_d_root = NULL; -} - -static void pkt_debugfs_init(void) -{ - pkt_debugfs_root = 
debugfs_create_dir(DRIVER_NAME, NULL); -} - -static void pkt_debugfs_cleanup(void) -{ - debugfs_remove(pkt_debugfs_root); - pkt_debugfs_root = NULL; -} - -/* ----------------------------------------------------------*/ - - -static void pkt_bio_finished(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - - BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); - if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { - dev_dbg(ddev, "queue empty\n"); - atomic_set(&pd->iosched.attention, 1); - wake_up(&pd->wqueue); - } -} - -/* - * Allocate a packet_data struct - */ -static struct packet_data *pkt_alloc_packet_data(int frames) -{ - int i; - struct packet_data *pkt; - - pkt = kzalloc(sizeof(struct packet_data), GFP_KERNEL); - if (!pkt) - goto no_pkt; - - pkt->frames = frames; - pkt->w_bio = bio_kmalloc(frames, GFP_KERNEL); - if (!pkt->w_bio) - goto no_bio; - - for (i = 0; i < frames / FRAMES_PER_PAGE; i++) { - pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!pkt->pages[i]) - goto no_page; - } - - spin_lock_init(&pkt->lock); - bio_list_init(&pkt->orig_bios); - - for (i = 0; i < frames; i++) { - pkt->r_bios[i] = bio_kmalloc(1, GFP_KERNEL); - if (!pkt->r_bios[i]) - goto no_rd_bio; - } - - return pkt; - -no_rd_bio: - for (i = 0; i < frames; i++) - kfree(pkt->r_bios[i]); -no_page: - for (i = 0; i < frames / FRAMES_PER_PAGE; i++) - if (pkt->pages[i]) - __free_page(pkt->pages[i]); - kfree(pkt->w_bio); -no_bio: - kfree(pkt); -no_pkt: - return NULL; -} - -/* - * Free a packet_data struct - */ -static void pkt_free_packet_data(struct packet_data *pkt) -{ - int i; - - for (i = 0; i < pkt->frames; i++) - kfree(pkt->r_bios[i]); - for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++) - __free_page(pkt->pages[i]); - kfree(pkt->w_bio); - kfree(pkt); -} - -static void pkt_shrink_pktlist(struct pktcdvd_device *pd) -{ - struct packet_data *pkt, *next; - - BUG_ON(!list_empty(&pd->cdrw.pkt_active_list)); - - list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) { - pkt_free_packet_data(pkt); - } - INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); -} - -static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets) -{ - struct packet_data *pkt; - - BUG_ON(!list_empty(&pd->cdrw.pkt_free_list)); - - while (nr_packets > 0) { - pkt = pkt_alloc_packet_data(pd->settings.size >> 2); - if (!pkt) { - pkt_shrink_pktlist(pd); - return 0; - } - pkt->id = nr_packets; - pkt->pd = pd; - list_add(&pkt->list, &pd->cdrw.pkt_free_list); - nr_packets--; - } - return 1; -} - -static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node) -{ - struct rb_node *n = rb_next(&node->rb_node); - if (!n) - return NULL; - return rb_entry(n, struct pkt_rb_node, rb_node); -} - -static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node) -{ - rb_erase(&node->rb_node, &pd->bio_queue); - mempool_free(node, &pd->rb_pool); - pd->bio_queue_size--; - BUG_ON(pd->bio_queue_size < 0); -} - -/* - * Find the first node in the pd->bio_queue rb tree with a starting sector >= s. 
- */ -static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s) -{ - struct rb_node *n = pd->bio_queue.rb_node; - struct rb_node *next; - struct pkt_rb_node *tmp; - - if (!n) { - BUG_ON(pd->bio_queue_size > 0); - return NULL; - } - - for (;;) { - tmp = rb_entry(n, struct pkt_rb_node, rb_node); - if (s <= tmp->bio->bi_iter.bi_sector) - next = n->rb_left; - else - next = n->rb_right; - if (!next) - break; - n = next; - } - - if (s > tmp->bio->bi_iter.bi_sector) { - tmp = pkt_rbtree_next(tmp); - if (!tmp) - return NULL; - } - BUG_ON(s > tmp->bio->bi_iter.bi_sector); - return tmp; -} - -/* - * Insert a node into the pd->bio_queue rb tree. - */ -static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node) -{ - struct rb_node **p = &pd->bio_queue.rb_node; - struct rb_node *parent = NULL; - sector_t s = node->bio->bi_iter.bi_sector; - struct pkt_rb_node *tmp; - - while (*p) { - parent = *p; - tmp = rb_entry(parent, struct pkt_rb_node, rb_node); - if (s < tmp->bio->bi_iter.bi_sector) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&node->rb_node, parent, p); - rb_insert_color(&node->rb_node, &pd->bio_queue); - pd->bio_queue_size++; -} - -/* - * Send a packet_command to the underlying block device and - * wait for completion. - */ -static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) -{ - struct request_queue *q = bdev_get_queue(file_bdev(pd->bdev_file)); - struct scsi_cmnd *scmd; - struct request *rq; - int ret = 0; - - rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? - REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); - if (IS_ERR(rq)) - return PTR_ERR(rq); - scmd = blk_mq_rq_to_pdu(rq); - - if (cgc->buflen) { - ret = blk_rq_map_kern(rq, cgc->buffer, cgc->buflen, - GFP_NOIO); - if (ret) - goto out; - } - - scmd->cmd_len = COMMAND_SIZE(cgc->cmd[0]); - memcpy(scmd->cmnd, cgc->cmd, CDROM_PACKET_SIZE); - - rq->timeout = 60*HZ; - if (cgc->quiet) - rq->rq_flags |= RQF_QUIET; - - blk_execute_rq(rq, false); - if (scmd->result) - ret = -EIO; -out: - blk_mq_free_request(rq); - return ret; -} - -static const char *sense_key_string(__u8 index) -{ - static const char * const info[] = { - "No sense", "Recovered error", "Not ready", - "Medium error", "Hardware error", "Illegal request", - "Unit attention", "Data protect", "Blank check", - }; - - return index < ARRAY_SIZE(info) ? info[index] : "INVALID"; -} - -/* - * A generic sense dump / resolve mechanism should be implemented across - * all ATAPI + SCSI devices. - */ -static void pkt_dump_sense(struct pktcdvd_device *pd, - struct packet_command *cgc) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct scsi_sense_hdr *sshdr = cgc->sshdr; - - if (sshdr) - dev_err(ddev, "%*ph - sense %02x.%02x.%02x (%s)\n", - CDROM_PACKET_SIZE, cgc->cmd, - sshdr->sense_key, sshdr->asc, sshdr->ascq, - sense_key_string(sshdr->sense_key)); - else - dev_err(ddev, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); -} - -/* - * flush the drive cache to media - */ -static int pkt_flush_cache(struct pktcdvd_device *pd) -{ - struct packet_command cgc; - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.cmd[0] = GPCMD_FLUSH_CACHE; - cgc.quiet = 1; - - /* - * the IMMED bit -- we default to not setting it, although that - * would allow a much faster close, this is safer - */ -#if 0 - cgc.cmd[1] = 1 << 1; -#endif - return pkt_generic_packet(pd, &cgc); -} - -/* - * speed is given as the normal factor, e.g. 
4 for 4x - */ -static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, - unsigned write_speed, unsigned read_speed) -{ - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - int ret; - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.sshdr = &sshdr; - cgc.cmd[0] = GPCMD_SET_SPEED; - put_unaligned_be16(read_speed, &cgc.cmd[2]); - put_unaligned_be16(write_speed, &cgc.cmd[4]); - - ret = pkt_generic_packet(pd, &cgc); - if (ret) - pkt_dump_sense(pd, &cgc); - - return ret; -} - -/* - * Queue a bio for processing by the low-level CD device. Must be called - * from process context. - */ -static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) -{ - /* - * Some CDRW drives can not handle writes larger than one packet, - * even if the size is a multiple of the packet size. - */ - bio->bi_opf |= REQ_NOMERGE; - - spin_lock(&pd->iosched.lock); - if (bio_data_dir(bio) == READ) - bio_list_add(&pd->iosched.read_queue, bio); - else - bio_list_add(&pd->iosched.write_queue, bio); - spin_unlock(&pd->iosched.lock); - - atomic_set(&pd->iosched.attention, 1); - wake_up(&pd->wqueue); -} - -/* - * Process the queued read/write requests. This function handles special - * requirements for CDRW drives: - * - A cache flush command must be inserted before a read request if the - * previous request was a write. - * - Switching between reading and writing is slow, so don't do it more often - * than necessary. - * - Optimize for throughput at the expense of latency. This means that streaming - * writes will never be interrupted by a read, but if the drive has to seek - * before the next write, switch to reading instead if there are any pending - * read requests. - * - Set the read speed according to current usage pattern. When only reading - * from the device, it's best to use the highest possible read speed, but - * when switching often between reading and writing, it's better to have the - * same read and write speeds. 
- */ -static void pkt_iosched_process_queue(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - - if (atomic_read(&pd->iosched.attention) == 0) - return; - atomic_set(&pd->iosched.attention, 0); - - for (;;) { - struct bio *bio; - int reads_queued, writes_queued; - - spin_lock(&pd->iosched.lock); - reads_queued = !bio_list_empty(&pd->iosched.read_queue); - writes_queued = !bio_list_empty(&pd->iosched.write_queue); - spin_unlock(&pd->iosched.lock); - - if (!reads_queued && !writes_queued) - break; - - if (pd->iosched.writing) { - int need_write_seek = 1; - spin_lock(&pd->iosched.lock); - bio = bio_list_peek(&pd->iosched.write_queue); - spin_unlock(&pd->iosched.lock); - if (bio && (bio->bi_iter.bi_sector == - pd->iosched.last_write)) - need_write_seek = 0; - if (need_write_seek && reads_queued) { - if (atomic_read(&pd->cdrw.pending_bios) > 0) { - dev_dbg(ddev, "write, waiting\n"); - break; - } - pkt_flush_cache(pd); - pd->iosched.writing = 0; - } - } else { - if (!reads_queued && writes_queued) { - if (atomic_read(&pd->cdrw.pending_bios) > 0) { - dev_dbg(ddev, "read, waiting\n"); - break; - } - pd->iosched.writing = 1; - } - } - - spin_lock(&pd->iosched.lock); - if (pd->iosched.writing) - bio = bio_list_pop(&pd->iosched.write_queue); - else - bio = bio_list_pop(&pd->iosched.read_queue); - spin_unlock(&pd->iosched.lock); - - if (!bio) - continue; - - if (bio_data_dir(bio) == READ) - pd->iosched.successive_reads += - bio->bi_iter.bi_size >> 10; - else { - pd->iosched.successive_reads = 0; - pd->iosched.last_write = bio_end_sector(bio); - } - if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { - if (pd->read_speed == pd->write_speed) { - pd->read_speed = MAX_SPEED; - pkt_set_speed(pd, pd->write_speed, pd->read_speed); - } - } else { - if (pd->read_speed != pd->write_speed) { - pd->read_speed = pd->write_speed; - pkt_set_speed(pd, pd->write_speed, pd->read_speed); - } - } - - atomic_inc(&pd->cdrw.pending_bios); - submit_bio_noacct(bio); - } -} - -/* - * Special care is needed if the underlying block device has a small - * max_phys_segments value. 
- */ -static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) -{ - struct device *ddev = disk_to_dev(pd->disk); - - if ((pd->settings.size << 9) / CD_FRAMESIZE <= queue_max_segments(q)) { - /* - * The cdrom device can handle one segment/frame - */ - clear_bit(PACKET_MERGE_SEGS, &pd->flags); - return 0; - } - - if ((pd->settings.size << 9) / PAGE_SIZE <= queue_max_segments(q)) { - /* - * We can handle this case at the expense of some extra memory - * copies during write operations - */ - set_bit(PACKET_MERGE_SEGS, &pd->flags); - return 0; - } - - dev_err(ddev, "cdrom max_phys_segments too small\n"); - return -EIO; -} - -static void pkt_end_io_read(struct bio *bio) -{ - struct packet_data *pkt = bio->bi_private; - struct pktcdvd_device *pd = pkt->pd; - BUG_ON(!pd); - - dev_dbg(disk_to_dev(pd->disk), "bio=%p sec0=%llx sec=%llx err=%d\n", - bio, pkt->sector, bio->bi_iter.bi_sector, bio->bi_status); - - if (bio->bi_status) - atomic_inc(&pkt->io_errors); - bio_uninit(bio); - if (atomic_dec_and_test(&pkt->io_wait)) { - atomic_inc(&pkt->run_sm); - wake_up(&pd->wqueue); - } - pkt_bio_finished(pd); -} - -static void pkt_end_io_packet_write(struct bio *bio) -{ - struct packet_data *pkt = bio->bi_private; - struct pktcdvd_device *pd = pkt->pd; - BUG_ON(!pd); - - dev_dbg(disk_to_dev(pd->disk), "id=%d, err=%d\n", pkt->id, bio->bi_status); - - pd->stats.pkt_ended++; - - bio_uninit(bio); - pkt_bio_finished(pd); - atomic_dec(&pkt->io_wait); - atomic_inc(&pkt->run_sm); - wake_up(&pd->wqueue); -} - -/* - * Schedule reads for the holes in a packet - */ -static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - struct device *ddev = disk_to_dev(pd->disk); - int frames_read = 0; - struct bio *bio; - int f; - char written[PACKET_MAX_SIZE]; - - BUG_ON(bio_list_empty(&pkt->orig_bios)); - - atomic_set(&pkt->io_wait, 0); - atomic_set(&pkt->io_errors, 0); - - /* - * Figure out which frames we need to read before we can write. - */ - memset(written, 0, sizeof(written)); - spin_lock(&pkt->lock); - bio_list_for_each(bio, &pkt->orig_bios) { - int first_frame = (bio->bi_iter.bi_sector - pkt->sector) / - (CD_FRAMESIZE >> 9); - int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE; - pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9); - BUG_ON(first_frame < 0); - BUG_ON(first_frame + num_frames > pkt->frames); - for (f = first_frame; f < first_frame + num_frames; f++) - written[f] = 1; - } - spin_unlock(&pkt->lock); - - if (pkt->cache_valid) { - dev_dbg(ddev, "zone %llx cached\n", pkt->sector); - goto out_account; - } - - /* - * Schedule reads for missing parts of the packet. 
- */ - for (f = 0; f < pkt->frames; f++) { - int p, offset; - - if (written[f]) - continue; - - bio = pkt->r_bios[f]; - bio_init(bio, file_bdev(pd->bdev_file), bio->bi_inline_vecs, 1, - REQ_OP_READ); - bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); - bio->bi_end_io = pkt_end_io_read; - bio->bi_private = pkt; - - p = (f * CD_FRAMESIZE) / PAGE_SIZE; - offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - dev_dbg(ddev, "Adding frame %d, page:%p offs:%d\n", f, - pkt->pages[p], offset); - if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) - BUG(); - - atomic_inc(&pkt->io_wait); - pkt_queue_bio(pd, bio); - frames_read++; - } - -out_account: - dev_dbg(ddev, "need %d frames for zone %llx\n", frames_read, pkt->sector); - pd->stats.pkt_started++; - pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); -} - -/* - * Find a packet matching zone, or the least recently used packet if - * there is no match. - */ -static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone) -{ - struct packet_data *pkt; - - list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) { - if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) { - list_del_init(&pkt->list); - if (pkt->sector != zone) - pkt->cache_valid = 0; - return pkt; - } - } - BUG(); - return NULL; -} - -static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - if (pkt->cache_valid) { - list_add(&pkt->list, &pd->cdrw.pkt_free_list); - } else { - list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list); - } -} - -static inline void pkt_set_state(struct device *ddev, struct packet_data *pkt, - enum packet_data_state state) -{ - static const char *state_name[] = { - "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" - }; - enum packet_data_state old_state = pkt->state; - - dev_dbg(ddev, "pkt %2d : s=%6llx %s -> %s\n", - pkt->id, pkt->sector, state_name[old_state], state_name[state]); - - pkt->state = state; -} - -/* - * Scan the work queue to see if we can start a new packet. - * returns non-zero if any work was done. - */ -static int pkt_handle_queue(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_data *pkt, *p; - struct bio *bio = NULL; - sector_t zone = 0; /* Suppress gcc warning */ - struct pkt_rb_node *node, *first_node; - struct rb_node *n; - - atomic_set(&pd->scan_queue, 0); - - if (list_empty(&pd->cdrw.pkt_free_list)) { - dev_dbg(ddev, "no pkt\n"); - return 0; - } - - /* - * Try to find a zone we are not already working on. 
- */ - spin_lock(&pd->lock); - first_node = pkt_rbtree_find(pd, pd->current_sector); - if (!first_node) { - n = rb_first(&pd->bio_queue); - if (n) - first_node = rb_entry(n, struct pkt_rb_node, rb_node); - } - node = first_node; - while (node) { - bio = node->bio; - zone = get_zone(bio->bi_iter.bi_sector, pd); - list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { - if (p->sector == zone) { - bio = NULL; - goto try_next_bio; - } - } - break; -try_next_bio: - node = pkt_rbtree_next(node); - if (!node) { - n = rb_first(&pd->bio_queue); - if (n) - node = rb_entry(n, struct pkt_rb_node, rb_node); - } - if (node == first_node) - node = NULL; - } - spin_unlock(&pd->lock); - if (!bio) { - dev_dbg(ddev, "no bio\n"); - return 0; - } - - pkt = pkt_get_packet_data(pd, zone); - - pd->current_sector = zone + pd->settings.size; - pkt->sector = zone; - BUG_ON(pkt->frames != pd->settings.size >> 2); - pkt->write_size = 0; - - /* - * Scan work queue for bios in the same zone and link them - * to this packet. - */ - spin_lock(&pd->lock); - dev_dbg(ddev, "looking for zone %llx\n", zone); - while ((node = pkt_rbtree_find(pd, zone)) != NULL) { - sector_t tmp = get_zone(node->bio->bi_iter.bi_sector, pd); - - bio = node->bio; - dev_dbg(ddev, "found zone=%llx\n", tmp); - if (tmp != zone) - break; - pkt_rbtree_erase(pd, node); - spin_lock(&pkt->lock); - bio_list_add(&pkt->orig_bios, bio); - pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE; - spin_unlock(&pkt->lock); - } - /* check write congestion marks, and if bio_queue_size is - * below, wake up any waiters - */ - if (pd->congested && - pd->bio_queue_size <= pd->write_congestion_off) { - pd->congested = false; - wake_up_var(&pd->congested); - } - spin_unlock(&pd->lock); - - pkt->sleep_time = max(PACKET_WAIT_TIME, 1); - pkt_set_state(ddev, pkt, PACKET_WAITING_STATE); - atomic_set(&pkt->run_sm, 1); - - spin_lock(&pd->cdrw.active_list_lock); - list_add(&pkt->list, &pd->cdrw.pkt_active_list); - spin_unlock(&pd->cdrw.active_list_lock); - - return 1; -} - -/** - * bio_list_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * Stops when it reaches the end of either the @src list or @dst list - that is, - * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of - * bios). - */ -static void bio_list_copy_data(struct bio *dst, struct bio *src) -{ - struct bvec_iter src_iter = src->bi_iter; - struct bvec_iter dst_iter = dst->bi_iter; - - while (1) { - if (!src_iter.bi_size) { - src = src->bi_next; - if (!src) - break; - - src_iter = src->bi_iter; - } - - if (!dst_iter.bi_size) { - dst = dst->bi_next; - if (!dst) - break; - - dst_iter = dst->bi_iter; - } - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } -} - -/* - * Assemble a bio to write one packet and queue the bio for processing - * by the underlying block device. - */ -static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - struct device *ddev = disk_to_dev(pd->disk); - int f; - - bio_init(pkt->w_bio, file_bdev(pd->bdev_file), pkt->w_bio->bi_inline_vecs, - pkt->frames, REQ_OP_WRITE); - pkt->w_bio->bi_iter.bi_sector = pkt->sector; - pkt->w_bio->bi_end_io = pkt_end_io_packet_write; - pkt->w_bio->bi_private = pkt; - - /* XXX: locking? 
*/ - for (f = 0; f < pkt->frames; f++) { - struct page *page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE]; - unsigned offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - - if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset)) - BUG(); - } - dev_dbg(ddev, "vcnt=%d\n", pkt->w_bio->bi_vcnt); - - /* - * Fill-in bvec with data from orig_bios. - */ - spin_lock(&pkt->lock); - bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head); - - pkt_set_state(ddev, pkt, PACKET_WRITE_WAIT_STATE); - spin_unlock(&pkt->lock); - - dev_dbg(ddev, "Writing %d frames for zone %llx\n", pkt->write_size, pkt->sector); - - if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) - pkt->cache_valid = 1; - else - pkt->cache_valid = 0; - - /* Start the write request */ - atomic_set(&pkt->io_wait, 1); - pkt_queue_bio(pd, pkt->w_bio); -} - -static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status) -{ - struct bio *bio; - - if (status) - pkt->cache_valid = 0; - - /* Finish all bios corresponding to this packet */ - while ((bio = bio_list_pop(&pkt->orig_bios))) { - bio->bi_status = status; - bio_endio(bio); - } -} - -static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) -{ - struct device *ddev = disk_to_dev(pd->disk); - - dev_dbg(ddev, "pkt %d\n", pkt->id); - - for (;;) { - switch (pkt->state) { - case PACKET_WAITING_STATE: - if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0)) - return; - - pkt->sleep_time = 0; - pkt_gather_data(pd, pkt); - pkt_set_state(ddev, pkt, PACKET_READ_WAIT_STATE); - break; - - case PACKET_READ_WAIT_STATE: - if (atomic_read(&pkt->io_wait) > 0) - return; - - if (atomic_read(&pkt->io_errors) > 0) { - pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE); - } else { - pkt_start_write(pd, pkt); - } - break; - - case PACKET_WRITE_WAIT_STATE: - if (atomic_read(&pkt->io_wait) > 0) - return; - - if (!pkt->w_bio->bi_status) { - pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE); - } else { - pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE); - } - break; - - case PACKET_RECOVERY_STATE: - dev_dbg(ddev, "No recovery possible\n"); - pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE); - break; - - case PACKET_FINISHED_STATE: - pkt_finish_packet(pkt, pkt->w_bio->bi_status); - return; - - default: - BUG(); - break; - } - } -} - -static void pkt_handle_packets(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_data *pkt, *next; - - /* - * Run state machine for active packets - */ - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (atomic_read(&pkt->run_sm) > 0) { - atomic_set(&pkt->run_sm, 0); - pkt_run_state_machine(pd, pkt); - } - } - - /* - * Move no longer active packets to the free list - */ - spin_lock(&pd->cdrw.active_list_lock); - list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) { - if (pkt->state == PACKET_FINISHED_STATE) { - list_del(&pkt->list); - pkt_put_packet_data(pd, pkt); - pkt_set_state(ddev, pkt, PACKET_IDLE_STATE); - atomic_set(&pd->scan_queue, 1); - } - } - spin_unlock(&pd->cdrw.active_list_lock); -} - -/* - * kcdrwd is woken up when writes have been queued for one of our - * registered devices - */ -static int kcdrwd(void *foobar) -{ - struct pktcdvd_device *pd = foobar; - struct device *ddev = disk_to_dev(pd->disk); - struct packet_data *pkt; - int states[PACKET_NUM_STATES]; - long min_sleep_time, residue; - - set_user_nice(current, MIN_NICE); - set_freezable(); - - for (;;) { - DECLARE_WAITQUEUE(wait, current); - - /* - * Wait until there is something to do 
- */ - add_wait_queue(&pd->wqueue, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - /* Check if we need to run pkt_handle_queue */ - if (atomic_read(&pd->scan_queue) > 0) - goto work_to_do; - - /* Check if we need to run the state machine for some packet */ - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (atomic_read(&pkt->run_sm) > 0) - goto work_to_do; - } - - /* Check if we need to process the iosched queues */ - if (atomic_read(&pd->iosched.attention) != 0) - goto work_to_do; - - /* Otherwise, go to sleep */ - pkt_count_states(pd, states); - dev_dbg(ddev, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], states[3], states[4], states[5]); - - min_sleep_time = MAX_SCHEDULE_TIMEOUT; - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (pkt->sleep_time && pkt->sleep_time < min_sleep_time) - min_sleep_time = pkt->sleep_time; - } - - dev_dbg(ddev, "sleeping\n"); - residue = schedule_timeout(min_sleep_time); - dev_dbg(ddev, "wake up\n"); - - /* make swsusp happy with our thread */ - try_to_freeze(); - - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (!pkt->sleep_time) - continue; - pkt->sleep_time -= min_sleep_time - residue; - if (pkt->sleep_time <= 0) { - pkt->sleep_time = 0; - atomic_inc(&pkt->run_sm); - } - } - - if (kthread_should_stop()) - break; - } -work_to_do: - set_current_state(TASK_RUNNING); - remove_wait_queue(&pd->wqueue, &wait); - - if (kthread_should_stop()) - break; - - /* - * if pkt_handle_queue returns true, we can queue - * another request. - */ - while (pkt_handle_queue(pd)) - ; - - /* - * Handle packet state machine - */ - pkt_handle_packets(pd); - - /* - * Handle iosched queues - */ - pkt_iosched_process_queue(pd); - } - - return 0; -} - -static void pkt_print_settings(struct pktcdvd_device *pd) -{ - dev_info(disk_to_dev(pd->disk), "%s packets, %u blocks, Mode-%c disc\n", - pd->settings.fp ? "Fixed" : "Variable", - pd->settings.size >> 2, - pd->settings.block_mode == 8 ? 
'1' : '2'); -} - -static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) -{ - memset(cgc->cmd, 0, sizeof(cgc->cmd)); - - cgc->cmd[0] = GPCMD_MODE_SENSE_10; - cgc->cmd[2] = page_code | (page_control << 6); - put_unaligned_be16(cgc->buflen, &cgc->cmd[7]); - cgc->data_direction = CGC_DATA_READ; - return pkt_generic_packet(pd, cgc); -} - -static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc) -{ - memset(cgc->cmd, 0, sizeof(cgc->cmd)); - memset(cgc->buffer, 0, 2); - cgc->cmd[0] = GPCMD_MODE_SELECT_10; - cgc->cmd[1] = 0x10; /* PF */ - put_unaligned_be16(cgc->buflen, &cgc->cmd[7]); - cgc->data_direction = CGC_DATA_WRITE; - return pkt_generic_packet(pd, cgc); -} - -static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di) -{ - struct packet_command cgc; - int ret; - - /* set up command and get the disc info */ - init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_DISC_INFO; - cgc.cmd[8] = cgc.buflen = 2; - cgc.quiet = 1; - - ret = pkt_generic_packet(pd, &cgc); - if (ret) - return ret; - - /* not all drives have the same disc_info length, so requeue - * packet with the length the drive tells us it can supply - */ - cgc.buflen = be16_to_cpu(di->disc_information_length) + - sizeof(di->disc_information_length); - - if (cgc.buflen > sizeof(disc_information)) - cgc.buflen = sizeof(disc_information); - - cgc.cmd[8] = cgc.buflen; - return pkt_generic_packet(pd, &cgc); -} - -static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti) -{ - struct packet_command cgc; - int ret; - - init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ); - cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO; - cgc.cmd[1] = type & 3; - put_unaligned_be16(track, &cgc.cmd[4]); - cgc.cmd[8] = 8; - cgc.quiet = 1; - - ret = pkt_generic_packet(pd, &cgc); - if (ret) - return ret; - - cgc.buflen = be16_to_cpu(ti->track_information_length) + - sizeof(ti->track_information_length); - - if (cgc.buflen > sizeof(track_information)) - cgc.buflen = sizeof(track_information); - - cgc.cmd[8] = cgc.buflen; - return pkt_generic_packet(pd, &cgc); -} - -static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, - long *last_written) -{ - disc_information di; - track_information ti; - __u32 last_track; - int ret; - - ret = pkt_get_disc_info(pd, &di); - if (ret) - return ret; - - last_track = (di.last_track_msb << 8) | di.last_track_lsb; - ret = pkt_get_track_info(pd, last_track, 1, &ti); - if (ret) - return ret; - - /* if this track is blank, try the previous. */ - if (ti.blank) { - last_track--; - ret = pkt_get_track_info(pd, last_track, 1, &ti); - if (ret) - return ret; - } - - /* if last recorded field is valid, return it. 
*/ - if (ti.lra_v) { - *last_written = be32_to_cpu(ti.last_rec_address); - } else { - /* make it up instead */ - *last_written = be32_to_cpu(ti.track_start) + - be32_to_cpu(ti.track_size); - if (ti.free_blocks) - *last_written -= (be32_to_cpu(ti.free_blocks) + 7); - } - return 0; -} - -/* - * write mode select package based on pd->settings - */ -static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - write_param_page *wp; - char buffer[128]; - int ret, size; - - /* doesn't apply to DVD+RW or DVD-RAM */ - if ((pd->mmc3_profile == 0x1a) || (pd->mmc3_profile == 0x12)) - return 0; - - memset(buffer, 0, sizeof(buffer)); - init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); - cgc.sshdr = &sshdr; - ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - size = 2 + get_unaligned_be16(&buffer[0]); - pd->mode_offset = get_unaligned_be16(&buffer[6]); - if (size > sizeof(buffer)) - size = sizeof(buffer); - - /* - * now get it all - */ - init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); - cgc.sshdr = &sshdr; - ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - /* - * write page is offset header + block descriptor length - */ - wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset]; - - wp->fp = pd->settings.fp; - wp->track_mode = pd->settings.track_mode; - wp->write_type = pd->settings.write_type; - wp->data_block_type = pd->settings.block_mode; - - wp->multi_session = 0; - -#ifdef PACKET_USE_LS - wp->link_size = 7; - wp->ls_v = 1; -#endif - - if (wp->data_block_type == PACKET_BLOCK_MODE1) { - wp->session_format = 0; - wp->subhdr2 = 0x20; - } else if (wp->data_block_type == PACKET_BLOCK_MODE2) { - wp->session_format = 0x20; - wp->subhdr2 = 8; -#if 0 - wp->mcn[0] = 0x80; - memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1); -#endif - } else { - /* - * paranoia - */ - dev_err(ddev, "write mode wrong %d\n", wp->data_block_type); - return 1; - } - wp->packet_size = cpu_to_be32(pd->settings.size >> 2); - - cgc.buflen = cgc.cmd[8] = size; - ret = pkt_mode_select(pd, &cgc); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - pkt_print_settings(pd); - return 0; -} - -/* - * 1 -- we can write to this track, 0 -- we can't - */ -static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) -{ - struct device *ddev = disk_to_dev(pd->disk); - - switch (pd->mmc3_profile) { - case 0x1a: /* DVD+RW */ - case 0x12: /* DVD-RAM */ - /* The track is always writable on DVD+RW/DVD-RAM */ - return 1; - default: - break; - } - - if (!ti->packet || !ti->fp) - return 0; - - /* - * "good" settings as per Mt Fuji. 
- */ - if (ti->rt == 0 && ti->blank == 0) - return 1; - - if (ti->rt == 0 && ti->blank == 1) - return 1; - - if (ti->rt == 1 && ti->blank == 0) - return 1; - - dev_err(ddev, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); - return 0; -} - -/* - * 1 -- we can write to this disc, 0 -- we can't - */ -static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) -{ - struct device *ddev = disk_to_dev(pd->disk); - - switch (pd->mmc3_profile) { - case 0x0a: /* CD-RW */ - case 0xffff: /* MMC3 not supported */ - break; - case 0x1a: /* DVD+RW */ - case 0x13: /* DVD-RW */ - case 0x12: /* DVD-RAM */ - return 1; - default: - dev_dbg(ddev, "Wrong disc profile (%x)\n", pd->mmc3_profile); - return 0; - } - - /* - * for disc type 0xff we should probably reserve a new track. - * but i'm not sure, should we leave this to user apps? probably. - */ - if (di->disc_type == 0xff) { - dev_notice(ddev, "unknown disc - no track?\n"); - return 0; - } - - if (di->disc_type != 0x20 && di->disc_type != 0) { - dev_err(ddev, "wrong disc type (%x)\n", di->disc_type); - return 0; - } - - if (di->erasable == 0) { - dev_err(ddev, "disc not erasable\n"); - return 0; - } - - if (di->border_status == PACKET_SESSION_RESERVED) { - dev_err(ddev, "can't write to last track (reserved)\n"); - return 0; - } - - return 1; -} - -static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_command cgc; - unsigned char buf[12]; - disc_information di; - track_information ti; - int ret, track; - - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); - cgc.cmd[0] = GPCMD_GET_CONFIGURATION; - cgc.cmd[8] = 8; - ret = pkt_generic_packet(pd, &cgc); - pd->mmc3_profile = ret ? 0xffff : get_unaligned_be16(&buf[6]); - - memset(&di, 0, sizeof(disc_information)); - memset(&ti, 0, sizeof(track_information)); - - ret = pkt_get_disc_info(pd, &di); - if (ret) { - dev_err(ddev, "failed get_disc\n"); - return ret; - } - - if (!pkt_writable_disc(pd, &di)) - return -EROFS; - - pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR; - - track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ - ret = pkt_get_track_info(pd, track, 1, &ti); - if (ret) { - dev_err(ddev, "failed get_track\n"); - return ret; - } - - if (!pkt_writable_track(pd, &ti)) { - dev_err(ddev, "can't write to this track\n"); - return -EROFS; - } - - /* - * we keep packet size in 512 byte units, makes it easier to - * deal with request calculations. - */ - pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; - if (pd->settings.size == 0) { - dev_notice(ddev, "detected zero packet size!\n"); - return -ENXIO; - } - if (pd->settings.size > PACKET_MAX_SECTORS) { - dev_err(ddev, "packet size is too big\n"); - return -EROFS; - } - pd->settings.fp = ti.fp; - pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1); - - if (ti.nwa_v) { - pd->nwa = be32_to_cpu(ti.next_writable); - set_bit(PACKET_NWA_VALID, &pd->flags); - } - - /* - * in theory we could use lra on -RW media as well and just zero - * blocks that haven't been written yet, but in practice that - * is just a no-go. we'll use that for -R, naturally. 
- */ - if (ti.lra_v) { - pd->lra = be32_to_cpu(ti.last_rec_address); - set_bit(PACKET_LRA_VALID, &pd->flags); - } else { - pd->lra = 0xffffffff; - set_bit(PACKET_LRA_VALID, &pd->flags); - } - - /* - * fine for now - */ - pd->settings.link_loss = 7; - pd->settings.write_type = 0; /* packet */ - pd->settings.track_mode = ti.track_mode; - - /* - * mode1 or mode2 disc - */ - switch (ti.data_mode) { - case PACKET_MODE1: - pd->settings.block_mode = PACKET_BLOCK_MODE1; - break; - case PACKET_MODE2: - pd->settings.block_mode = PACKET_BLOCK_MODE2; - break; - default: - dev_err(ddev, "unknown data mode\n"); - return -EROFS; - } - return 0; -} - -/* - * enable/disable write caching on drive - */ -static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - unsigned char buf[64]; - bool set = IS_ENABLED(CONFIG_CDROM_PKTCDVD_WCACHE); - int ret; - - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); - cgc.sshdr = &sshdr; - cgc.buflen = pd->mode_offset + 12; - - /* - * caching mode page might not be there, so quiet this command - */ - cgc.quiet = 1; - - ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0); - if (ret) - return ret; - - /* - * use drive write caching -- we need deferred error handling to be - * able to successfully recover with this option (drive will return good - * status as soon as the cdb is validated). - */ - buf[pd->mode_offset + 10] |= (set << 2); - - cgc.buflen = cgc.cmd[8] = 2 + get_unaligned_be16(&buf[0]); - ret = pkt_mode_select(pd, &cgc); - if (ret) { - dev_err(ddev, "write caching control failed\n"); - pkt_dump_sense(pd, &cgc); - } else if (!ret && set) - dev_notice(ddev, "enabled write caching\n"); - return ret; -} - -static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag) -{ - struct packet_command cgc; - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL; - cgc.cmd[4] = lockflag ? 1 : 0; - return pkt_generic_packet(pd, &cgc); -} - -/* - * Returns drive maximum write speed - */ -static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, - unsigned *write_speed) -{ - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - unsigned char buf[256+18]; - unsigned char *cap_buf; - int ret, offset; - - cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; - init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); - cgc.sshdr = &sshdr; - - ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); - if (ret) { - cgc.buflen = pd->mode_offset + cap_buf[1] + 2 + - sizeof(struct mode_page_header); - ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - } - - offset = 20; /* Obsoleted field, used by older drives */ - if (cap_buf[1] >= 28) - offset = 28; /* Current write speed selected */ - if (cap_buf[1] >= 30) { - /* If the drive reports at least one "Logical Unit Write - * Speed Performance Descriptor Block", use the information - * in the first block. 
(contains the highest speed) - */ - int num_spdb = get_unaligned_be16(&cap_buf[30]); - if (num_spdb > 0) - offset = 34; - } - - *write_speed = get_unaligned_be16(&cap_buf[offset]); - return 0; -} - -/* These tables from cdrecord - I don't have orange book */ -/* standard speed CD-RW (1-4x) */ -static char clv_to_speed[16] = { - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ - 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; -/* high speed CD-RW (-10x) */ -static char hs_clv_to_speed[16] = { - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ - 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; -/* ultra high speed CD-RW */ -static char us_clv_to_speed[16] = { - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ - 0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0 -}; - -/* - * reads the maximum media speed from ATIP - */ -static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, - unsigned *speed) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - unsigned char buf[64]; - unsigned int size, st, sp; - int ret; - - init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ); - cgc.sshdr = &sshdr; - cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; - cgc.cmd[1] = 2; - cgc.cmd[2] = 4; /* READ ATIP */ - cgc.cmd[8] = 2; - ret = pkt_generic_packet(pd, &cgc); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - size = 2 + get_unaligned_be16(&buf[0]); - if (size > sizeof(buf)) - size = sizeof(buf); - - init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); - cgc.sshdr = &sshdr; - cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; - cgc.cmd[1] = 2; - cgc.cmd[2] = 4; - cgc.cmd[8] = size; - ret = pkt_generic_packet(pd, &cgc); - if (ret) { - pkt_dump_sense(pd, &cgc); - return ret; - } - - if (!(buf[6] & 0x40)) { - dev_notice(ddev, "disc type is not CD-RW\n"); - return 1; - } - if (!(buf[6] & 0x4)) { - dev_notice(ddev, "A1 values on media are not valid, maybe not CDRW?\n"); - return 1; - } - - st = (buf[6] >> 3) & 0x7; /* disc sub-type */ - - sp = buf[16] & 0xf; /* max speed from ATIP A1 field */ - - /* Info from cdrecord */ - switch (st) { - case 0: /* standard speed */ - *speed = clv_to_speed[sp]; - break; - case 1: /* high speed */ - *speed = hs_clv_to_speed[sp]; - break; - case 2: /* ultra high speed */ - *speed = us_clv_to_speed[sp]; - break; - default: - dev_notice(ddev, "unknown disc sub-type %d\n", st); - return 1; - } - if (*speed) { - dev_info(ddev, "maximum media speed: %d\n", *speed); - return 0; - } else { - dev_notice(ddev, "unknown speed %d for sub-type %d\n", sp, st); - return 1; - } -} - -static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - struct packet_command cgc; - struct scsi_sense_hdr sshdr; - int ret; - - dev_dbg(ddev, "Performing OPC\n"); - - init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); - cgc.sshdr = &sshdr; - cgc.timeout = 60*HZ; - cgc.cmd[0] = GPCMD_SEND_OPC; - cgc.cmd[1] = 1; - ret = pkt_generic_packet(pd, &cgc); - if (ret) - pkt_dump_sense(pd, &cgc); - return ret; -} - -static int pkt_open_write(struct pktcdvd_device *pd) -{ - struct device *ddev = disk_to_dev(pd->disk); - int ret; - unsigned int write_speed, media_write_speed, read_speed; - - ret = pkt_probe_settings(pd); - if (ret) { - dev_dbg(ddev, "failed probe\n"); - return ret; - } - - ret = pkt_set_write_settings(pd); - if (ret) { - dev_notice(ddev, "failed saving write settings\n"); - return -EIO; - } - - pkt_write_caching(pd); - - ret = pkt_get_max_speed(pd, &write_speed); - if (ret) - write_speed = 16 * 177; - 
switch (pd->mmc3_profile) { - case 0x13: /* DVD-RW */ - case 0x1a: /* DVD+RW */ - case 0x12: /* DVD-RAM */ - dev_notice(ddev, "write speed %ukB/s\n", write_speed); - break; - default: - ret = pkt_media_speed(pd, &media_write_speed); - if (ret) - media_write_speed = 16; - write_speed = min(write_speed, media_write_speed * 177); - dev_notice(ddev, "write speed %ux\n", write_speed / 176); - break; - } - read_speed = write_speed; - - ret = pkt_set_speed(pd, write_speed, read_speed); - if (ret) { - dev_notice(ddev, "couldn't set write speed\n"); - return -EIO; - } - pd->write_speed = write_speed; - pd->read_speed = read_speed; - - ret = pkt_perform_opc(pd); - if (ret) - dev_notice(ddev, "Optimum Power Calibration failed\n"); - - return 0; -} - -/* - * called at open time. - */ -static int pkt_open_dev(struct pktcdvd_device *pd, bool write) -{ - struct device *ddev = disk_to_dev(pd->disk); - int ret; - long lba; - struct request_queue *q; - struct file *bdev_file; - - /* - * We need to re-open the cdrom device without O_NONBLOCK to be able - * to read/write from/to it. It is already opened in O_NONBLOCK mode - * so open should not fail. - */ - bdev_file = bdev_file_open_by_dev(file_bdev(pd->bdev_file)->bd_dev, - BLK_OPEN_READ, pd, NULL); - if (IS_ERR(bdev_file)) { - ret = PTR_ERR(bdev_file); - goto out; - } - pd->f_open_bdev = bdev_file; - - ret = pkt_get_last_written(pd, &lba); - if (ret) { - dev_err(ddev, "pkt_get_last_written failed\n"); - goto out_putdev; - } - - set_capacity(pd->disk, lba << 2); - set_capacity_and_notify(file_bdev(pd->bdev_file)->bd_disk, lba << 2); - - q = bdev_get_queue(file_bdev(pd->bdev_file)); - if (write) { - ret = pkt_open_write(pd); - if (ret) - goto out_putdev; - set_bit(PACKET_WRITABLE, &pd->flags); - } else { - pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - clear_bit(PACKET_WRITABLE, &pd->flags); - } - - ret = pkt_set_segment_merging(pd, q); - if (ret) - goto out_putdev; - - if (write) { - if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - dev_err(ddev, "not enough memory for buffers\n"); - ret = -ENOMEM; - goto out_putdev; - } - dev_info(ddev, "%lukB available on disc\n", lba << 1); - } - set_blocksize(bdev_file, CD_FRAMESIZE); - - return 0; - -out_putdev: - fput(bdev_file); -out: - return ret; -} - -/* - * called when the device is closed. makes sure that the device flushes - * the internal cache before we close. 
- */ -static void pkt_release_dev(struct pktcdvd_device *pd, int flush) -{ - struct device *ddev = disk_to_dev(pd->disk); - - if (flush && pkt_flush_cache(pd)) - dev_notice(ddev, "not flushing cache\n"); - - pkt_lock_door(pd, 0); - - pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - fput(pd->f_open_bdev); - pd->f_open_bdev = NULL; - - pkt_shrink_pktlist(pd); -} - -static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor) -{ - if (dev_minor >= MAX_WRITERS) - return NULL; - - dev_minor = array_index_nospec(dev_minor, MAX_WRITERS); - return pkt_devs[dev_minor]; -} - -static int pkt_open(struct gendisk *disk, blk_mode_t mode) -{ - struct pktcdvd_device *pd = NULL; - int ret; - - mutex_lock(&pktcdvd_mutex); - mutex_lock(&ctl_mutex); - pd = pkt_find_dev_from_minor(disk->first_minor); - if (!pd) { - ret = -ENODEV; - goto out; - } - BUG_ON(pd->refcnt < 0); - - pd->refcnt++; - if (pd->refcnt > 1) { - if ((mode & BLK_OPEN_WRITE) && - !test_bit(PACKET_WRITABLE, &pd->flags)) { - ret = -EBUSY; - goto out_dec; - } - } else { - ret = pkt_open_dev(pd, mode & BLK_OPEN_WRITE); - if (ret) - goto out_dec; - } - mutex_unlock(&ctl_mutex); - mutex_unlock(&pktcdvd_mutex); - return 0; - -out_dec: - pd->refcnt--; -out: - mutex_unlock(&ctl_mutex); - mutex_unlock(&pktcdvd_mutex); - return ret; -} - -static void pkt_release(struct gendisk *disk) -{ - struct pktcdvd_device *pd = disk->private_data; - - mutex_lock(&pktcdvd_mutex); - mutex_lock(&ctl_mutex); - pd->refcnt--; - BUG_ON(pd->refcnt < 0); - if (pd->refcnt == 0) { - int flush = test_bit(PACKET_WRITABLE, &pd->flags); - pkt_release_dev(pd, flush); - } - mutex_unlock(&ctl_mutex); - mutex_unlock(&pktcdvd_mutex); -} - - -static void pkt_end_io_read_cloned(struct bio *bio) -{ - struct packet_stacked_data *psd = bio->bi_private; - struct pktcdvd_device *pd = psd->pd; - - psd->bio->bi_status = bio->bi_status; - bio_put(bio); - bio_endio(psd->bio); - mempool_free(psd, &psd_pool); - pkt_bio_finished(pd); -} - -static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) -{ - struct bio *cloned_bio = bio_alloc_clone(file_bdev(pd->bdev_file), bio, - GFP_NOIO, &pkt_bio_set); - struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO); - - psd->pd = pd; - psd->bio = bio; - cloned_bio->bi_private = psd; - cloned_bio->bi_end_io = pkt_end_io_read_cloned; - pd->stats.secs_r += bio_sectors(bio); - pkt_queue_bio(pd, cloned_bio); -} - -static void pkt_make_request_write(struct bio *bio) -{ - struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data; - sector_t zone; - struct packet_data *pkt; - int was_empty, blocked_bio; - struct pkt_rb_node *node; - - zone = get_zone(bio->bi_iter.bi_sector, pd); - - /* - * If we find a matching packet in state WAITING or READ_WAIT, we can - * just append this bio to that packet. 
- */ - spin_lock(&pd->cdrw.active_list_lock); - blocked_bio = 0; - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - if (pkt->sector == zone) { - spin_lock(&pkt->lock); - if ((pkt->state == PACKET_WAITING_STATE) || - (pkt->state == PACKET_READ_WAIT_STATE)) { - bio_list_add(&pkt->orig_bios, bio); - pkt->write_size += - bio->bi_iter.bi_size / CD_FRAMESIZE; - if ((pkt->write_size >= pkt->frames) && - (pkt->state == PACKET_WAITING_STATE)) { - atomic_inc(&pkt->run_sm); - wake_up(&pd->wqueue); - } - spin_unlock(&pkt->lock); - spin_unlock(&pd->cdrw.active_list_lock); - return; - } else { - blocked_bio = 1; - } - spin_unlock(&pkt->lock); - } - } - spin_unlock(&pd->cdrw.active_list_lock); - - /* - * Test if there is enough room left in the bio work queue - * (queue size >= congestion on mark). - * If not, wait till the work queue size is below the congestion off mark. - */ - spin_lock(&pd->lock); - if (pd->write_congestion_on > 0 - && pd->bio_queue_size >= pd->write_congestion_on) { - struct wait_bit_queue_entry wqe; - - init_wait_var_entry(&wqe, &pd->congested, 0); - for (;;) { - prepare_to_wait_event(__var_waitqueue(&pd->congested), - &wqe.wq_entry, - TASK_UNINTERRUPTIBLE); - if (pd->bio_queue_size <= pd->write_congestion_off) - break; - pd->congested = true; - spin_unlock(&pd->lock); - schedule(); - spin_lock(&pd->lock); - } - } - spin_unlock(&pd->lock); - - /* - * No matching packet found. Store the bio in the work queue. - */ - node = mempool_alloc(&pd->rb_pool, GFP_NOIO); - node->bio = bio; - spin_lock(&pd->lock); - BUG_ON(pd->bio_queue_size < 0); - was_empty = (pd->bio_queue_size == 0); - pkt_rbtree_insert(pd, node); - spin_unlock(&pd->lock); - - /* - * Wake up the worker thread. - */ - atomic_set(&pd->scan_queue, 1); - if (was_empty) { - /* This wake_up is required for correct operation */ - wake_up(&pd->wqueue); - } else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) { - /* - * This wake up is not required for correct operation, - * but improves performance in some cases. - */ - wake_up(&pd->wqueue); - } -} - -static void pkt_submit_bio(struct bio *bio) -{ - struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data; - struct device *ddev = disk_to_dev(pd->disk); - struct bio *split; - - bio = bio_split_to_limits(bio); - if (!bio) - return; - - dev_dbg(ddev, "start = %6llx stop = %6llx\n", - bio->bi_iter.bi_sector, bio_end_sector(bio)); - - /* - * Clone READ bios so we can have our own bi_end_io callback. 
- */ - if (bio_data_dir(bio) == READ) { - pkt_make_request_read(pd, bio); - return; - } - - if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - dev_notice(ddev, "WRITE for ro device (%llu)\n", bio->bi_iter.bi_sector); - goto end_io; - } - - if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) { - dev_err(ddev, "wrong bio size\n"); - goto end_io; - } - - do { - sector_t zone = get_zone(bio->bi_iter.bi_sector, pd); - sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd); - - if (last_zone != zone) { - BUG_ON(last_zone != zone + pd->settings.size); - - split = bio_split(bio, last_zone - - bio->bi_iter.bi_sector, - GFP_NOIO, &pkt_bio_set); - bio_chain(split, bio); - } else { - split = bio; - } - - pkt_make_request_write(split); - } while (split != bio); - - return; -end_io: - bio_io_error(bio); -} - -static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) -{ - struct device *ddev = disk_to_dev(pd->disk); - int i; - struct file *bdev_file; - struct scsi_device *sdev; - - if (pd->pkt_dev == dev) { - dev_err(ddev, "recursive setup not allowed\n"); - return -EBUSY; - } - for (i = 0; i < MAX_WRITERS; i++) { - struct pktcdvd_device *pd2 = pkt_devs[i]; - if (!pd2) - continue; - if (file_bdev(pd2->bdev_file)->bd_dev == dev) { - dev_err(ddev, "%pg already setup\n", - file_bdev(pd2->bdev_file)); - return -EBUSY; - } - if (pd2->pkt_dev == dev) { - dev_err(ddev, "can't chain pktcdvd devices\n"); - return -EBUSY; - } - } - - bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY, - NULL, NULL); - if (IS_ERR(bdev_file)) - return PTR_ERR(bdev_file); - sdev = scsi_device_from_queue(file_bdev(bdev_file)->bd_disk->queue); - if (!sdev) { - fput(bdev_file); - return -EINVAL; - } - put_device(&sdev->sdev_gendev); - - /* This is safe, since we have a reference from open(). */ - __module_get(THIS_MODULE); - - pd->bdev_file = bdev_file; - - atomic_set(&pd->cdrw.pending_bios, 0); - pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name); - if (IS_ERR(pd->cdrw.thread)) { - dev_err(ddev, "can't start kernel thread\n"); - goto out_mem; - } - - proc_create_single_data(pd->disk->disk_name, 0, pkt_proc, pkt_seq_show, pd); - dev_notice(ddev, "writer mapped to %pg\n", file_bdev(bdev_file)); - return 0; - -out_mem: - fput(bdev_file); - /* This is safe: open() is still holding a reference. */ - module_put(THIS_MODULE); - return -ENOMEM; -} - -static int pkt_ioctl(struct block_device *bdev, blk_mode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct pktcdvd_device *pd = bdev->bd_disk->private_data; - struct device *ddev = disk_to_dev(pd->disk); - int ret; - - dev_dbg(ddev, "cmd %x, dev %d:%d\n", cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - - mutex_lock(&pktcdvd_mutex); - switch (cmd) { - case CDROMEJECT: - /* - * The door gets locked when the device is opened, so we - * have to unlock it or else the eject command fails. 
- */ - if (pd->refcnt == 1) - pkt_lock_door(pd, 0); - fallthrough; - /* - * forward selected CDROM ioctls to CD-ROM, for UDF - */ - case CDROMMULTISESSION: - case CDROMREADTOCENTRY: - case CDROM_LAST_WRITTEN: - case CDROM_SEND_PACKET: - case SCSI_IOCTL_SEND_COMMAND: - if (!bdev->bd_disk->fops->ioctl) - ret = -ENOTTY; - else - ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); - break; - default: - dev_dbg(ddev, "Unknown ioctl (%x)\n", cmd); - ret = -ENOTTY; - } - mutex_unlock(&pktcdvd_mutex); - - return ret; -} - -static unsigned int pkt_check_events(struct gendisk *disk, - unsigned int clearing) -{ - struct pktcdvd_device *pd = disk->private_data; - struct gendisk *attached_disk; - - if (!pd) - return 0; - if (!pd->bdev_file) - return 0; - attached_disk = file_bdev(pd->bdev_file)->bd_disk; - if (!attached_disk || !attached_disk->fops->check_events) - return 0; - return attached_disk->fops->check_events(attached_disk, clearing); -} - -static char *pkt_devnode(struct gendisk *disk, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "pktcdvd/%s", disk->disk_name); -} - -static const struct block_device_operations pktcdvd_ops = { - .owner = THIS_MODULE, - .submit_bio = pkt_submit_bio, - .open = pkt_open, - .release = pkt_release, - .ioctl = pkt_ioctl, - .compat_ioctl = blkdev_compat_ptr_ioctl, - .check_events = pkt_check_events, - .devnode = pkt_devnode, -}; - -/* - * Set up mapping from pktcdvd device to CD-ROM device. - */ -static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) -{ - struct queue_limits lim = { - .max_hw_sectors = PACKET_MAX_SECTORS, - .logical_block_size = CD_FRAMESIZE, - .features = BLK_FEAT_ROTATIONAL, - }; - int idx; - int ret = -ENOMEM; - struct pktcdvd_device *pd; - struct gendisk *disk; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - for (idx = 0; idx < MAX_WRITERS; idx++) - if (!pkt_devs[idx]) - break; - if (idx == MAX_WRITERS) { - pr_err("max %d writers supported\n", MAX_WRITERS); - ret = -EBUSY; - goto out_mutex; - } - - pd = kzalloc(sizeof(struct pktcdvd_device), GFP_KERNEL); - if (!pd) - goto out_mutex; - - ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE, - sizeof(struct pkt_rb_node)); - if (ret) - goto out_mem; - - INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); - INIT_LIST_HEAD(&pd->cdrw.pkt_active_list); - spin_lock_init(&pd->cdrw.active_list_lock); - - spin_lock_init(&pd->lock); - spin_lock_init(&pd->iosched.lock); - bio_list_init(&pd->iosched.read_queue); - bio_list_init(&pd->iosched.write_queue); - init_waitqueue_head(&pd->wqueue); - pd->bio_queue = RB_ROOT; - - pd->write_congestion_on = write_congestion_on; - pd->write_congestion_off = write_congestion_off; - - disk = blk_alloc_disk(&lim, NUMA_NO_NODE); - if (IS_ERR(disk)) { - ret = PTR_ERR(disk); - goto out_mem; - } - pd->disk = disk; - disk->major = pktdev_major; - disk->first_minor = idx; - disk->minors = 1; - disk->fops = &pktcdvd_ops; - disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; - snprintf(disk->disk_name, sizeof(disk->disk_name), DRIVER_NAME"%d", idx); - disk->private_data = pd; - - pd->pkt_dev = MKDEV(pktdev_major, idx); - ret = pkt_new_dev(pd, dev); - if (ret) - goto out_mem2; - - /* inherit events of the host device */ - disk->events = file_bdev(pd->bdev_file)->bd_disk->events; - - ret = add_disk(disk); - if (ret) - goto out_mem2; - - pkt_sysfs_dev_new(pd); - pkt_debugfs_dev_new(pd); - - pkt_devs[idx] = pd; - if (pkt_dev) - *pkt_dev = pd->pkt_dev; - - mutex_unlock(&ctl_mutex); - return 0; - -out_mem2: - put_disk(disk); -out_mem: - mempool_exit(&pd->rb_pool); - 
kfree(pd); -out_mutex: - mutex_unlock(&ctl_mutex); - pr_err("setup of pktcdvd device failed\n"); - return ret; -} - -/* - * Tear down mapping from pktcdvd device to CD-ROM device. - */ -static int pkt_remove_dev(dev_t pkt_dev) -{ - struct pktcdvd_device *pd; - struct device *ddev; - int idx; - int ret = 0; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - for (idx = 0; idx < MAX_WRITERS; idx++) { - pd = pkt_devs[idx]; - if (pd && (pd->pkt_dev == pkt_dev)) - break; - } - if (idx == MAX_WRITERS) { - pr_debug("dev not setup\n"); - ret = -ENXIO; - goto out; - } - - if (pd->refcnt > 0) { - ret = -EBUSY; - goto out; - } - - ddev = disk_to_dev(pd->disk); - - if (!IS_ERR(pd->cdrw.thread)) - kthread_stop(pd->cdrw.thread); - - pkt_devs[idx] = NULL; - - pkt_debugfs_dev_remove(pd); - pkt_sysfs_dev_remove(pd); - - fput(pd->bdev_file); - - remove_proc_entry(pd->disk->disk_name, pkt_proc); - dev_notice(ddev, "writer unmapped\n"); - - del_gendisk(pd->disk); - put_disk(pd->disk); - - mempool_exit(&pd->rb_pool); - kfree(pd); - - /* This is safe: open() is still holding a reference. */ - module_put(THIS_MODULE); - -out: - mutex_unlock(&ctl_mutex); - return ret; -} - -static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) -{ - struct pktcdvd_device *pd; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - - pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index); - if (pd) { - ctrl_cmd->dev = new_encode_dev(file_bdev(pd->bdev_file)->bd_dev); - ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); - } else { - ctrl_cmd->dev = 0; - ctrl_cmd->pkt_dev = 0; - } - ctrl_cmd->num_devices = MAX_WRITERS; - - mutex_unlock(&ctl_mutex); -} - -static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - struct pkt_ctrl_command ctrl_cmd; - int ret = 0; - dev_t pkt_dev = 0; - - if (cmd != PACKET_CTRL_CMD) - return -ENOTTY; - - if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command))) - return -EFAULT; - - switch (ctrl_cmd.command) { - case PKT_CTRL_CMD_SETUP: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev); - ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev); - break; - case PKT_CTRL_CMD_TEARDOWN: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev)); - break; - case PKT_CTRL_CMD_STATUS: - pkt_get_status(&ctrl_cmd); - break; - default: - return -ENOTTY; - } - - if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command))) - return -EFAULT; - return ret; -} - -#ifdef CONFIG_COMPAT -static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif - -static const struct file_operations pkt_ctl_fops = { - .open = nonseekable_open, - .unlocked_ioctl = pkt_ctl_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = pkt_ctl_compat_ioctl, -#endif - .owner = THIS_MODULE, -}; - -static struct miscdevice pkt_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = DRIVER_NAME, - .nodename = "pktcdvd/control", - .fops = &pkt_ctl_fops -}; - -static int __init pkt_init(void) -{ - int ret; - - mutex_init(&ctl_mutex); - - ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE, - sizeof(struct packet_stacked_data)); - if (ret) - return ret; - ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0); - if (ret) { - mempool_exit(&psd_pool); - return ret; - } - - ret = register_blkdev(pktdev_major, DRIVER_NAME); - if (ret < 0) { - pr_err("unable to register 
block device\n"); - goto out2; - } - if (!pktdev_major) - pktdev_major = ret; - - ret = pkt_sysfs_init(); - if (ret) - goto out; - - pkt_debugfs_init(); - - ret = misc_register(&pkt_misc); - if (ret) { - pr_err("unable to register misc device\n"); - goto out_misc; - } - - pkt_proc = proc_mkdir("driver/"DRIVER_NAME, NULL); - - return 0; - -out_misc: - pkt_debugfs_cleanup(); - pkt_sysfs_cleanup(); -out: - unregister_blkdev(pktdev_major, DRIVER_NAME); -out2: - mempool_exit(&psd_pool); - bioset_exit(&pkt_bio_set); - return ret; -} - -static void __exit pkt_exit(void) -{ - remove_proc_entry("driver/"DRIVER_NAME, NULL); - misc_deregister(&pkt_misc); - - pkt_debugfs_cleanup(); - pkt_sysfs_cleanup(); - - unregister_blkdev(pktdev_major, DRIVER_NAME); - mempool_exit(&psd_pool); - bioset_exit(&pkt_bio_set); -} - -MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives"); -MODULE_AUTHOR("Jens Axboe <axboe@suse.de>"); -MODULE_LICENSE("GPL"); - -module_init(pkt_init); -module_exit(pkt_exit); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index b5727dea15bd..7af21fe67671 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -957,8 +957,10 @@ static bool vdc_port_mpgroup_check(struct vio_dev *vdev) dev = device_find_child(vdev->dev.parent, &port_data, vdc_device_probed); - if (dev) + if (dev) { + put_device(dev); return true; + } return false; } diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9fd284fa76dc..6561d2a561fa 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -48,6 +48,8 @@ #define UBLK_MINORS (1U << MINORBITS) +#define UBLK_INVALID_BUF_IDX ((u16)-1) + /* private ioctl command mirror */ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) @@ -70,7 +72,8 @@ | UBLK_F_UPDATE_SIZE \ | UBLK_F_AUTO_BUF_REG \ | UBLK_F_QUIESCE \ - | UBLK_F_PER_IO_DAEMON) + | UBLK_F_PER_IO_DAEMON \ + | UBLK_F_BUF_REG_OFF_DAEMON) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -82,14 +85,6 @@ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) -struct ublk_rq_data { - refcount_t ref; - - /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */ - u16 buf_index; - void *buf_ctx_handle; -}; - struct ublk_uring_cmd_pdu { /* * Store requests in same batch temporarily for queuing them to @@ -110,8 +105,6 @@ struct ublk_uring_cmd_pdu { */ struct ublk_queue *ubq; - struct ublk_auto_buf_reg buf; - u16 tag; }; @@ -155,9 +148,19 @@ struct ublk_uring_cmd_pdu { /* atomic RW with ubq->cancel_lock */ #define UBLK_IO_FLAG_CANCELED 0x80000000 +/* + * Initialize refcount to a large number to include any registered buffers. + * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for + * any buffers registered on the io daemon task. 
+ */ +#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) + struct ublk_io { /* userspace buffer address from io cmd */ - __u64 addr; + union { + __u64 addr; + struct ublk_auto_buf_reg buf; + }; unsigned int flags; int res; @@ -169,7 +172,24 @@ struct ublk_io { }; struct task_struct *task; -}; + + /* + * The number of uses of this I/O by the ublk server + * if user copy or zero copy are enabled: + * - UBLK_REFCOUNT_INIT from dispatch to the server + * until UBLK_IO_COMMIT_AND_FETCH_REQ + * - 1 for each inflight ublk_ch_{read,write}_iter() call + * - 1 for each io_uring registered buffer not registered on task + * The I/O can only be completed once all references are dropped. + * User copy and buffer registration operations are only permitted + * if the reference count is nonzero. + */ + refcount_t ref; + /* Count of buffers registered on task and not yet unregistered */ + unsigned task_registered_buffers; + + void *buf_ctx_handle; +} ____cacheline_aligned_in_smp; struct ublk_queue { int q_id; @@ -216,6 +236,9 @@ struct ublk_device { struct completion completion; unsigned int nr_queues_ready; unsigned int nr_privileged_daemon; + struct mutex cancel_mutex; + bool canceling; + pid_t ublksrv_tgid; }; /* header of ublk_params */ @@ -228,7 +251,8 @@ static void ublk_io_release(void *priv); static void ublk_stop_dev_unlocked(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - const struct ublk_queue *ubq, int tag, size_t offset); + const struct ublk_queue *ubq, struct ublk_io *io, + size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc * @@ -673,38 +697,29 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) } static inline void ublk_init_req_ref(const struct ublk_queue *ubq, - struct request *req) + struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - refcount_set(&data->ref, 1); - } + if (ublk_need_req_ref(ubq)) + refcount_set(&io->ref, UBLK_REFCOUNT_INIT); } -static inline bool ublk_get_req_ref(const struct ublk_queue *ubq, - struct request *req) +static inline bool ublk_get_req_ref(struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - return refcount_inc_not_zero(&data->ref); - } + return refcount_inc_not_zero(&io->ref); +} - return true; +static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req) +{ + if (refcount_dec_and_test(&io->ref)) + __ublk_complete_rq(req); } -static inline void ublk_put_req_ref(const struct ublk_queue *ubq, - struct request *req) +static inline bool ublk_sub_req_ref(struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers; - if (refcount_dec_and_test(&data->ref)) - __ublk_complete_rq(req); - } else { - __ublk_complete_rq(req); - } + io->task_registered_buffers = 0; + return refcount_sub_and_test(sub_refs, &io->ref); } static inline bool ublk_need_get_data(const struct ublk_queue *ubq) @@ -981,7 +996,7 @@ static inline bool ublk_need_unmap_req(const struct request *req) } static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, - struct ublk_io *io) + const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); @@ -1005,7 +1020,7 @@ static int ublk_map_io(const struct 
ublk_queue *ubq, const struct request *req, static int ublk_unmap_io(const struct ublk_queue *ubq, const struct request *req, - struct ublk_io *io) + const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); @@ -1140,7 +1155,7 @@ static inline void __ublk_complete_rq(struct request *req) if (blk_update_request(req, BLK_STS_OK, io->res)) blk_mq_requeue_request(req, true); - else + else if (likely(!blk_should_fake_timeout(req->q))) __blk_mq_end_request(req, BLK_STS_OK); return; @@ -1188,39 +1203,33 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, blk_mq_end_request(rq, BLK_STS_IOERR); } -static void ublk_auto_buf_reg_fallback(struct request *req) +static void +ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io) { - const struct ublk_queue *ubq = req->mq_hctx->driver_data; - struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + unsigned tag = io - ubq->ios; + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag); iod->op_flags |= UBLK_IO_F_NEED_REG_BUF; - refcount_set(&data->ref, 1); } -static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io, - unsigned int issue_flags) +static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io, unsigned int issue_flags) { - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd); - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); int ret; ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release, - pdu->buf.index, issue_flags); + io->buf.index, issue_flags); if (ret) { - if (pdu->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) { - ublk_auto_buf_reg_fallback(req); + if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) { + ublk_auto_buf_reg_fallback(ubq, io); return true; } blk_mq_end_request(req, BLK_STS_IOERR); return false; } - /* one extra reference is dropped by ublk_io_release */ - refcount_set(&data->ref, 2); - data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd); - /* store buffer index in request payload */ - data->buf_index = pdu->buf.index; + io->task_registered_buffers = 1; + io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd); io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; return true; } @@ -1229,10 +1238,10 @@ static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq, struct request *req, struct ublk_io *io, unsigned int issue_flags) { + ublk_init_req_ref(ubq, io); if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) - return ublk_auto_buf_reg(req, io, issue_flags); + return ublk_auto_buf_reg(ubq, req, io, issue_flags); - ublk_init_req_ref(ubq, req); return true; } @@ -1356,14 +1365,23 @@ static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l) static enum blk_eh_timer_return ublk_timeout(struct request *rq) { struct ublk_queue *ubq = rq->mq_hctx->driver_data; - struct ublk_io *io = &ubq->ios[rq->tag]; + pid_t tgid = ubq->dev->ublksrv_tgid; + struct task_struct *p; + struct pid *pid; - if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { - send_sig(SIGKILL, io->task, 0); - return BLK_EH_DONE; - } + if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV)) + return BLK_EH_RESET_TIMER; - return BLK_EH_RESET_TIMER; + if (unlikely(!tgid)) + return BLK_EH_RESET_TIMER; + + rcu_read_lock(); + pid = find_vpid(tgid); + p = pid_task(pid, PIDTYPE_PID); + if (p) + send_sig(SIGKILL, p, 0); + rcu_read_unlock(); + return BLK_EH_DONE; } static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, @@ -1504,6 +1522,9 @@ static void ublk_queue_reinit(struct ublk_device *ub, 
struct ublk_queue *ubq) put_task_struct(io->task); io->task = NULL; } + + WARN_ON_ONCE(refcount_read(&io->ref)); + WARN_ON_ONCE(io->task_registered_buffers); } } @@ -1515,6 +1536,7 @@ static int ublk_ch_open(struct inode *inode, struct file *filp) if (test_and_set_bit(UB_STATE_OPEN, &ub->state)) return -EBUSY; filp->private_data = ub; + ub->ublksrv_tgid = current->tgid; return 0; } @@ -1529,6 +1551,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) ub->mm = NULL; ub->nr_queues_ready = 0; ub->nr_privileged_daemon = 0; + ub->ublksrv_tgid = -1; } static struct gendisk *ublk_get_disk(struct ublk_device *ub) @@ -1550,6 +1573,27 @@ static void ublk_put_disk(struct gendisk *disk) put_device(disk_to_dev(disk)); } +/* + * Use this function to ensure that ->canceling is consistently set for + * the device and all queues. Do not set these flags directly. + * + * Caller must ensure that: + * - cancel_mutex is held. This ensures that there is no concurrent + * access to ub->canceling and no concurrent writes to ubq->canceling. + * - there are no concurrent reads of ubq->canceling from the queue_rq + * path. This can be done by quiescing the queue, or through other + * means. + */ +static void ublk_set_canceling(struct ublk_device *ub, bool canceling) + __must_hold(&ub->cancel_mutex) +{ + int i; + + ub->canceling = canceling; + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_get_queue(ub, i)->canceling = canceling; +} + static int ublk_ch_release(struct inode *inode, struct file *filp) { struct ublk_device *ub = filp->private_data; @@ -1578,12 +1622,11 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) * All requests may be inflight, so ->canceling may not be set, set * it now. */ - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); - - ubq->canceling = true; - ublk_abort_queue(ub, ubq); - } + mutex_lock(&ub->cancel_mutex); + ublk_set_canceling(ub, true); + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_abort_queue(ub, ublk_get_queue(ub, i)); + mutex_unlock(&ub->cancel_mutex); blk_mq_kick_requeue_list(disk->queue); /* @@ -1706,23 +1749,17 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) } } -/* Must be called when queue is frozen */ -static void ublk_mark_queue_canceling(struct ublk_queue *ubq) +static void ublk_start_cancel(struct ublk_device *ub) { - spin_lock(&ubq->cancel_lock); - if (!ubq->canceling) - ubq->canceling = true; - spin_unlock(&ubq->cancel_lock); -} - -static void ublk_start_cancel(struct ublk_queue *ubq) -{ - struct ublk_device *ub = ubq->dev; struct gendisk *disk = ublk_get_disk(ub); /* Our disk has been dead */ if (!disk) return; + + mutex_lock(&ub->cancel_mutex); + if (ub->canceling) + goto out; /* * Now we are serialized with ublk_queue_rq() * @@ -1731,8 +1768,10 @@ static void ublk_start_cancel(struct ublk_queue *ubq) * touch completed uring_cmd */ blk_mq_quiesce_queue(disk->queue); - ublk_mark_queue_canceling(ubq); + ublk_set_canceling(ub, true); blk_mq_unquiesce_queue(disk->queue); +out: + mutex_unlock(&ub->cancel_mutex); ublk_put_disk(disk); } @@ -1805,8 +1844,7 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, if (WARN_ON_ONCE(task && task != io->task)) return; - if (!ubq->canceling) - ublk_start_cancel(ubq); + ublk_start_cancel(ubq->dev); WARN_ON_ONCE(io->cmd != cmd); ublk_cancel_cmd(ubq, pdu->tag, issue_flags); @@ -1930,9 +1968,11 @@ static void ublk_reset_io_flags(struct ublk_device *ub) for (j = 0; j < ubq->q_depth; j++) ubq->ios[j].flags 
&= ~UBLK_IO_FLAG_CANCELED; spin_unlock(&ubq->cancel_lock); - ubq->canceling = false; ubq->fail_io = false; } + mutex_lock(&ub->cancel_mutex); + ublk_set_canceling(ub, false); + mutex_unlock(&ub->cancel_mutex); } /* device can only be started after all IOs are ready */ @@ -1967,12 +2007,66 @@ static inline int ublk_check_cmd_op(u32 cmd_op) return 0; } -static inline void ublk_fill_io_cmd(struct ublk_io *io, - struct io_uring_cmd *cmd, unsigned long buf_addr) +static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd) +{ + io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); + + if (io->buf.reserved0 || io->buf.reserved1) + return -EINVAL; + + if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) + return -EINVAL; + return 0; +} + +static int ublk_handle_auto_buf_reg(struct ublk_io *io, + struct io_uring_cmd *cmd, + u16 *buf_idx) { + if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) { + io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG; + + /* + * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ` + * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same + * `io_ring_ctx`. + * + * If this uring_cmd's io_ring_ctx isn't same with the + * one for registering the buffer, it is ublk server's + * responsibility for unregistering the buffer, otherwise + * this ublk request gets stuck. + */ + if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) + *buf_idx = io->buf.index; + } + + return ublk_set_auto_buf_reg(io, cmd); +} + +/* Once we return, `io->req` can't be used any more */ +static inline struct request * +ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd) +{ + struct request *req = io->req; + io->cmd = cmd; io->flags |= UBLK_IO_FLAG_ACTIVE; + /* now this cmd slot is owned by ublk driver */ + io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; + + return req; +} + +static inline int +ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io, + struct io_uring_cmd *cmd, unsigned long buf_addr, + u16 *buf_idx) +{ + if (ublk_support_auto_buf_reg(ubq)) + return ublk_handle_auto_buf_reg(io, cmd, buf_idx); + io->addr = buf_addr; + return 0; } static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, @@ -1990,30 +2084,25 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, io_uring_cmd_mark_cancelable(cmd, issue_flags); } -static inline int ublk_set_auto_buf_reg(struct io_uring_cmd *cmd) -{ - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - - pdu->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); - - if (pdu->buf.reserved0 || pdu->buf.reserved1) - return -EINVAL; - - if (pdu->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) - return -EINVAL; - return 0; -} - static void ublk_io_release(void *priv) { struct request *rq = priv; struct ublk_queue *ubq = rq->mq_hctx->driver_data; + struct ublk_io *io = &ubq->ios[rq->tag]; - ublk_put_req_ref(ubq, rq); + /* + * task_registered_buffers may be 0 if buffers were registered off task + * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ. 
+ */ + if (current == io->task && io->task_registered_buffers) + io->task_registered_buffers--; + else + ublk_put_req_ref(io, rq); } static int ublk_register_io_buf(struct io_uring_cmd *cmd, - const struct ublk_queue *ubq, unsigned int tag, + const struct ublk_queue *ubq, + struct ublk_io *io, unsigned int index, unsigned int issue_flags) { struct ublk_device *ub = cmd->file->private_data; @@ -2023,30 +2112,75 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, if (!ublk_support_zero_copy(ubq)) return -EINVAL; - req = __ublk_check_and_get_req(ub, ubq, tag, 0); + req = __ublk_check_and_get_req(ub, ubq, io, 0); if (!req) return -EINVAL; ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, issue_flags); if (ret) { - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return ret; } return 0; } +static int +ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, + const struct ublk_queue *ubq, struct ublk_io *io, + unsigned index, unsigned issue_flags) +{ + unsigned new_registered_buffers; + struct request *req = io->req; + int ret; + + /* + * Ensure there are still references for ublk_sub_req_ref() to release. + * If not, fall back on the thread-safe buffer registration. + */ + new_registered_buffers = io->task_registered_buffers + 1; + if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT)) + return ublk_register_io_buf(cmd, ubq, io, index, issue_flags); + + if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req)) + return -EINVAL; + + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, + issue_flags); + if (ret) + return ret; + + io->task_registered_buffers = new_registered_buffers; + return 0; +} + static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, - const struct ublk_queue *ubq, + const struct ublk_device *ub, unsigned int index, unsigned int issue_flags) { - if (!ublk_support_zero_copy(ubq)) + if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)) return -EINVAL; return io_buffer_unregister_bvec(cmd, index, issue_flags); } +static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr) +{ + if (ublk_need_map_io(ubq)) { + /* + * FETCH_RQ has to provide IO buffer if NEED GET + * DATA is not enabled + */ + if (!buf_addr && !ublk_need_get_data(ubq)) + return -EINVAL; + } else if (buf_addr) { + /* User copy requires addr to be unset */ + return -EINVAL; + } + return 0; +} + static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, struct ublk_io *io, __u64 buf_addr) { @@ -2073,26 +2207,11 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); - if (ublk_need_map_io(ubq)) { - /* - * FETCH_RQ has to provide IO buffer if NEED GET - * DATA is not enabled - */ - if (!buf_addr && !ublk_need_get_data(ubq)) - goto out; - } else if (buf_addr) { - /* User copy requires addr to be unset */ - ret = -EINVAL; + ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL); + if (ret) goto out; - } - if (ublk_support_auto_buf_reg(ubq)) { - ret = ublk_set_auto_buf_reg(cmd); - if (ret) - goto out; - } - - ublk_fill_io_cmd(io, cmd, buf_addr); WRITE_ONCE(io->task, get_task_struct(current)); ublk_mark_io_ready(ub, ubq); out: @@ -2100,10 +2219,8 @@ out: return ret; } -static int ublk_commit_and_fetch(const struct ublk_queue *ubq, - struct ublk_io *io, struct io_uring_cmd *cmd, - const struct ublksrv_io_cmd *ub_cmd, - unsigned int issue_flags) +static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq, + struct ublk_io *io, __u64 
buf_addr) { struct request *req = io->req; @@ -2112,10 +2229,10 @@ static int ublk_commit_and_fetch(const struct ublk_queue *ubq, * COMMIT_AND_FETCH_REQ has to provide IO buffer if * NEED GET DATA is not enabled or it is Read IO. */ - if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || + if (!buf_addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) return -EINVAL; - } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) { + } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) { /* * User copy requires addr to be unset when command is * not zone append @@ -2123,48 +2240,17 @@ static int ublk_commit_and_fetch(const struct ublk_queue *ubq, return -EINVAL; } - if (ublk_support_auto_buf_reg(ubq)) { - int ret; - - /* - * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ` - * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same - * `io_ring_ctx`. - * - * If this uring_cmd's io_ring_ctx isn't same with the - * one for registering the buffer, it is ublk server's - * responsibility for unregistering the buffer, otherwise - * this ublk request gets stuck. - */ - if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - if (data->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) - io_buffer_unregister_bvec(cmd, data->buf_index, - issue_flags); - io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG; - } - - ret = ublk_set_auto_buf_reg(cmd); - if (ret) - return ret; - } - - ublk_fill_io_cmd(io, cmd, ub_cmd->addr); - - /* now this cmd slot is owned by ublk driver */ - io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; - io->res = ub_cmd->result; - - if (req_op(req) == REQ_OP_ZONE_APPEND) - req->__sector = ub_cmd->zone_append_lba; - - if (likely(!blk_should_fake_timeout(req->q))) - ublk_put_req_ref(ubq, req); - return 0; } +static bool ublk_need_complete_req(const struct ublk_queue *ubq, + struct ublk_io *io) +{ + if (ublk_need_req_ref(ubq)) + return ublk_sub_req_ref(io); + return true; +} + static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, struct request *req) { @@ -2187,19 +2273,33 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, const struct ublksrv_io_cmd *ub_cmd) { + u16 buf_idx = UBLK_INVALID_BUF_IDX; struct ublk_device *ub = cmd->file->private_data; - struct task_struct *task; struct ublk_queue *ubq; struct ublk_io *io; u32 cmd_op = cmd->cmd_op; unsigned tag = ub_cmd->tag; - int ret = -EINVAL; struct request *req; + int ret; + bool compl; pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", __func__, cmd->cmd_op, ub_cmd->q_id, tag, ub_cmd->result); + ret = ublk_check_cmd_op(cmd_op); + if (ret) + goto out; + + /* + * io_buffer_unregister_bvec() doesn't access the ubq or io, + * so no need to validate the q_id, tag, or task + */ + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) + return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr, + issue_flags); + + ret = -EINVAL; if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) goto out; @@ -2209,21 +2309,37 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, goto out; io = &ubq->ios[tag]; - task = READ_ONCE(io->task); - if (task && task != current) + /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */ + if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) { + ret = ublk_check_fetch_buf(ubq, ub_cmd->addr); + if (ret) + goto out; + ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); + if (ret) + goto out; + + ublk_prep_cancel(cmd, issue_flags, ubq, tag); + return -EIOCBQUEUED; + } + + if (READ_ONCE(io->task) != current) { 
+ /* + * ublk_register_io_buf() accesses only the io's refcount, + * so can be handled on any task + */ + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) + return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr, + issue_flags); + goto out; + } /* there is pending io cmd, something must be wrong */ - if (io->flags & UBLK_IO_FLAG_ACTIVE) { + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) { ret = -EBUSY; goto out; } - /* only UBLK_IO_FETCH_REQ is allowed if io is not OWNED_BY_SRV */ - if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) && - _IOC_NR(cmd_op) != UBLK_IO_FETCH_REQ) - goto out; - /* * ensure that the user issues UBLK_IO_NEED_GET_DATA * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA. @@ -2232,23 +2348,27 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; - ret = ublk_check_cmd_op(cmd_op); - if (ret) - goto out; - - ret = -EINVAL; switch (_IOC_NR(cmd_op)) { case UBLK_IO_REGISTER_IO_BUF: - return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); - case UBLK_IO_UNREGISTER_IO_BUF: - return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags); - case UBLK_IO_FETCH_REQ: - ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); + return ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr, + issue_flags); + case UBLK_IO_COMMIT_AND_FETCH_REQ: + ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr); if (ret) goto out; - break; - case UBLK_IO_COMMIT_AND_FETCH_REQ: - ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags); + io->res = ub_cmd->result; + req = ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx); + compl = ublk_need_complete_req(ubq, io); + + /* can't touch 'ublk_io' any more */ + if (buf_idx != UBLK_INVALID_BUF_IDX) + io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ub_cmd->zone_append_lba; + if (compl) + __ublk_complete_rq(req); + if (ret) goto out; break; @@ -2258,9 +2378,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, * uring_cmd active first and prepare for handling new requeued * request */ - req = io->req; - ublk_fill_io_cmd(io, cmd, ub_cmd->addr); - io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; + req = ublk_fill_io_cmd(io, cmd); + ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL); + WARN_ON_ONCE(ret); if (likely(ublk_get_data(ubq, io, req))) { __ublk_prep_compl_io_cmd(io, req); return UBLK_IO_RES_OK; @@ -2279,15 +2399,20 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - const struct ublk_queue *ubq, int tag, size_t offset) + const struct ublk_queue *ubq, struct ublk_io *io, size_t offset) { + unsigned tag = io - ubq->ios; struct request *req; + /* + * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ, + * which would overwrite it with io->cmd + */ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); if (!req) return NULL; - if (!ublk_get_req_ref(ubq, req)) + if (!ublk_get_req_ref(io)) return NULL; if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) @@ -2301,7 +2426,7 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, return req; fail_put: - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return NULL; } @@ -2368,7 +2493,8 @@ static inline bool ublk_check_ubuf_dir(const struct request *req, } static struct request *ublk_check_and_get_req(struct kiocb *iocb, - struct iov_iter *iter, size_t *off, int 
dir) + struct iov_iter *iter, size_t *off, int dir, + struct ublk_io **io) { struct ublk_device *ub = iocb->ki_filp->private_data; struct ublk_queue *ubq; @@ -2402,7 +2528,8 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, if (tag >= ubq->q_depth) return ERR_PTR(-EINVAL); - req = __ublk_check_and_get_req(ub, ubq, tag, buf_off); + *io = &ubq->ios[tag]; + req = __ublk_check_and_get_req(ub, ubq, *io, buf_off); if (!req) return ERR_PTR(-EINVAL); @@ -2415,42 +2542,40 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, *off = buf_off; return req; fail: - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(*io, req); return ERR_PTR(-EACCES); } static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct ublk_queue *ubq; struct request *req; + struct ublk_io *io; size_t buf_off; size_t ret; - req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST); + req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io); if (IS_ERR(req)) return PTR_ERR(req); ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST); - ubq = req->mq_hctx->driver_data; - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return ret; } static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct ublk_queue *ubq; struct request *req; + struct ublk_io *io; size_t buf_off; size_t ret; - req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE); + req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io); if (IS_ERR(req)) return PTR_ERR(req); ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE); - ubq = req->mq_hctx->driver_data; - ublk_put_req_ref(ubq, req); + ublk_put_req_ref(io, req); return ret; } @@ -2475,6 +2600,8 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id) struct ublk_io *io = &ubq->ios[i]; if (io->task) put_task_struct(io->task); + WARN_ON_ONCE(refcount_read(&io->ref)); + WARN_ON_ONCE(io->task_registered_buffers); } if (ubq->io_cmd_buf) @@ -2513,7 +2640,7 @@ static void ublk_deinit_queues(struct ublk_device *ub) for (i = 0; i < nr_queues; i++) ublk_deinit_queue(ub, i); - kfree(ub->__queues); + kvfree(ub->__queues); } static int ublk_init_queues(struct ublk_device *ub) @@ -2524,7 +2651,7 @@ static int ublk_init_queues(struct ublk_device *ub) int i, ret = -ENOMEM; ub->queue_size = ubq_size; - ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL); + ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL); if (!ub->__queues) return ret; @@ -2580,6 +2707,7 @@ static void ublk_cdev_rel(struct device *dev) ublk_deinit_queues(ub); ublk_free_dev_number(ub); mutex_destroy(&ub->mutex); + mutex_destroy(&ub->cancel_mutex); kfree(ub); } @@ -2627,7 +2755,6 @@ static int ublk_add_tag_set(struct ublk_device *ub) ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; - ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); ub->tag_set.driver_data = ub; return blk_mq_alloc_tag_set(&ub->tag_set); } @@ -2729,6 +2856,9 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; + if (ub->ublksrv_tgid != ublksrv_pid) + return -EINVAL; + mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_LIVE || test_bit(UB_STATE_USED, &ub->state)) { @@ -2933,6 +3063,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) goto out_unlock; mutex_init(&ub->mutex); spin_lock_init(&ub->lock); + mutex_init(&ub->cancel_mutex); ret = 
ublk_alloc_dev_number(ub, header->dev_id); if (ret < 0) @@ -2953,7 +3084,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | UBLK_F_URING_CMD_COMP_IN_TASK | - UBLK_F_PER_IO_DAEMON; + UBLK_F_PER_IO_DAEMON | + UBLK_F_BUF_REG_OFF_DAEMON; /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | @@ -3003,6 +3135,7 @@ out_free_dev_number: ublk_free_dev_number(ub); out_free_ub: mutex_destroy(&ub->mutex); + mutex_destroy(&ub->cancel_mutex); kfree(ub); out_unlock: mutex_unlock(&ublk_ctl_mutex); @@ -3227,6 +3360,9 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__, header->dev_id); + if (ub->ublksrv_tgid != ublksrv_pid) + return -EINVAL; + mutex_lock(&ub->mutex); if (ublk_nosrv_should_stop_dev(ub)) goto out_unlock; @@ -3340,7 +3476,7 @@ static int ublk_ctrl_quiesce_dev(struct ublk_device *ub, /* zero means wait forever */ u64 timeout_ms = header->data[0]; struct gendisk *disk; - int i, ret = -ENODEV; + int ret = -ENODEV; if (!(ub->dev_info.flags & UBLK_F_QUIESCE)) return -EOPNOTSUPP; @@ -3357,14 +3493,12 @@ static int ublk_ctrl_quiesce_dev(struct ublk_device *ub, if (ub->dev_info.state != UBLK_S_DEV_LIVE) goto put_disk; - /* Mark all queues as canceling */ + /* Mark the device as canceling */ + mutex_lock(&ub->cancel_mutex); blk_mq_quiesce_queue(disk->queue); - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); - - ubq->canceling = true; - } + ublk_set_canceling(ub, true); blk_mq_unquiesce_queue(disk->queue); + mutex_unlock(&ub->cancel_mutex); if (!timeout_ms) timeout_ms = UINT_MAX; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 30bca8cb7106..e649fa67bac1 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -976,9 +976,8 @@ static int init_vq(struct virtio_blk *vblk) return -EINVAL; } - num_vqs = min_t(unsigned int, - min_not_zero(num_request_queues, nr_cpu_ids), - num_vqs); + num_vqs = blk_mq_num_possible_queues( + min_not_zero(num_request_queues, num_vqs)); num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1); diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index d26a58c67e95..b1bd1daa0060 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/cpuhotplug.h> #include <linux/vmalloc.h> +#include <linux/sysfs.h> #include "zcomp.h" @@ -89,23 +90,21 @@ bool zcomp_available_algorithm(const char *comp) } /* show available compressors */ -ssize_t zcomp_available_show(const char *comp, char *buf) +ssize_t zcomp_available_show(const char *comp, char *buf, ssize_t at) { - ssize_t sz = 0; int i; for (i = 0; i < ARRAY_SIZE(backends) - 1; i++) { if (!strcmp(comp, backends[i]->name)) { - sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "[%s] ", backends[i]->name); + at += sysfs_emit_at(buf, at, "[%s] ", + backends[i]->name); } else { - sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "%s ", backends[i]->name); + at += sysfs_emit_at(buf, at, "%s ", backends[i]->name); } } - sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); - return sz; + at += sysfs_emit_at(buf, at, "\n"); + return at; } struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 4acffe671a5e..eacfd3f7d61d 100644 --- a/drivers/block/zram/zcomp.h +++ 
b/drivers/block/zram/zcomp.h @@ -79,7 +79,7 @@ struct zcomp { int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node); int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node); -ssize_t zcomp_available_show(const char *comp, char *buf); +ssize_t zcomp_available_show(const char *comp, char *buf, ssize_t at); bool zcomp_available_algorithm(const char *comp); struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 54c57103715f..8acad3cc6e6e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -373,7 +373,7 @@ static ssize_t initstate_show(struct device *dev, val = init_done(zram); up_read(&zram->init_lock); - return scnprintf(buf, PAGE_SIZE, "%u\n", val); + return sysfs_emit(buf, "%u\n", val); } static ssize_t disksize_show(struct device *dev, @@ -381,7 +381,7 @@ static ssize_t disksize_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); + return sysfs_emit(buf, "%llu\n", zram->disksize); } static ssize_t mem_limit_store(struct device *dev, @@ -532,7 +532,7 @@ static ssize_t writeback_limit_enable_show(struct device *dev, spin_unlock(&zram->wb_limit_lock); up_read(&zram->init_lock); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } static ssize_t writeback_limit_store(struct device *dev, @@ -567,7 +567,7 @@ static ssize_t writeback_limit_show(struct device *dev, spin_unlock(&zram->wb_limit_lock); up_read(&zram->init_lock); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return sysfs_emit(buf, "%llu\n", val); } static void reset_bdev(struct zram *zram) @@ -1225,12 +1225,13 @@ static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg) zram->comp_algs[prio] = alg; } -static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, char *buf) +static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, + char *buf, ssize_t at) { ssize_t sz; down_read(&zram->init_lock); - sz = zcomp_available_show(zram->comp_algs[prio], buf); + sz = zcomp_available_show(zram->comp_algs[prio], buf, at); up_read(&zram->init_lock); return sz; @@ -1387,7 +1388,7 @@ static ssize_t comp_algorithm_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf); + return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf, 0); } static ssize_t comp_algorithm_store(struct device *dev, @@ -1415,8 +1416,8 @@ static ssize_t recomp_algorithm_show(struct device *dev, if (!zram->comp_algs[prio]) continue; - sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, "#%d: ", prio); - sz += __comp_algorithm_show(zram, prio, buf + sz); + sz += sysfs_emit_at(buf, sz, "#%d: ", prio); + sz += __comp_algorithm_show(zram, prio, buf, sz); } return sz; @@ -1488,7 +1489,7 @@ static ssize_t io_stat_show(struct device *dev, ssize_t ret; down_read(&zram->init_lock); - ret = scnprintf(buf, PAGE_SIZE, + ret = sysfs_emit(buf, "%8llu %8llu 0 %8llu\n", (u64)atomic64_read(&zram->stats.failed_reads), (u64)atomic64_read(&zram->stats.failed_writes), @@ -1518,7 +1519,7 @@ static ssize_t mm_stat_show(struct device *dev, orig_size = atomic64_read(&zram->stats.pages_stored); max_used = atomic_long_read(&zram->stats.max_used_pages); - ret = scnprintf(buf, PAGE_SIZE, + ret = sysfs_emit(buf, "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n", orig_size << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.compr_data_size), @@ 
-1543,8 +1544,8 @@ static ssize_t bd_stat_show(struct device *dev, ssize_t ret; down_read(&zram->init_lock); - ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu\n", + ret = sysfs_emit(buf, + "%8llu %8llu %8llu\n", FOUR_K((u64)atomic64_read(&zram->stats.bd_count)), FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)), FOUR_K((u64)atomic64_read(&zram->stats.bd_writes))); @@ -1562,7 +1563,7 @@ static ssize_t debug_stat_show(struct device *dev, ssize_t ret; down_read(&zram->init_lock); - ret = scnprintf(buf, PAGE_SIZE, + ret = sysfs_emit(buf, "version: %d\n0 %8llu\n", version, (u64)atomic64_read(&zram->stats.miss_free)); @@ -2810,7 +2811,7 @@ static ssize_t hot_add_show(const struct class *class, if (ret < 0) return ret; - return scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return sysfs_emit(buf, "%d\n", ret); } /* This attribute must be set to 0400, so CLASS_ATTR_RO() can not be used */ static struct class_attribute class_attr_hot_add = diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 21a10552da61..31ba1f8c1f78 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -624,9 +624,6 @@ int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi) if (check_media_type == 1) cdi->options |= (int) CDO_CHECK_TYPE; - if (CDROM_CAN(CDC_MRW_W)) - cdi->exit = cdrom_mrw_exit; - if (cdi->ops->read_cdda_bpc) cdi->cdda_method = CDDA_BPC_FULL; else @@ -651,9 +648,6 @@ void unregister_cdrom(struct cdrom_device_info *cdi) list_del(&cdi->list); mutex_unlock(&cdrom_mutex); - if (cdi->exit) - cdi->exit(cdi); - cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name); } EXPORT_SYMBOL(unregister_cdrom); @@ -1264,6 +1258,8 @@ void cdrom_release(struct cdrom_device_info *cdi) cd_dbg(CD_CLOSE, "Use count for \"/dev/%s\" now zero\n", cdi->name); cdrom_dvd_rw_close_write(cdi); + if (CDROM_CAN(CDC_MRW_W)) + cdrom_mrw_exit(cdi); if ((cdo->capability & CDC_LOCK) && !cdi->keeplocked) { cd_dbg(CD_CLOSE, "Unlocking door!\n"); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 2ea490b9d370..1492c8552255 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -168,14 +168,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, { const char *err; struct cache_sb_disk *s; - struct page *page; + struct folio *folio; unsigned int i; - page = read_cache_page_gfp(bdev->bd_mapping, - SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); - if (IS_ERR(page)) + folio = mapping_read_folio_gfp(bdev->bd_mapping, + SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL); + if (IS_ERR(folio)) return "IO error"; - s = page_address(page) + offset_in_page(SB_OFFSET); + s = folio_address(folio) + offset_in_folio(folio, SB_OFFSET); sb->offset = le64_to_cpu(s->offset); sb->version = le64_to_cpu(s->version); @@ -272,7 +272,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, *res = s; return NULL; err: - put_page(page); + folio_put(folio); return err; } @@ -1366,7 +1366,7 @@ static CLOSURE_CALLBACK(cached_dev_free) mutex_unlock(&bch_register_lock); if (dc->sb_disk) - put_page(virt_to_page(dc->sb_disk)); + folio_put(virt_to_folio(dc->sb_disk)); if (dc->bdev_file) fput(dc->bdev_file); @@ -2216,7 +2216,7 @@ void bch_cache_release(struct kobject *kobj) free_fifo(&ca->free[i]); if (ca->sb_disk) - put_page(virt_to_page(ca->sb_disk)); + folio_put(virt_to_folio(ca->sb_disk)); if (ca->bdev_file) fput(ca->bdev_file); @@ -2593,7 +2593,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!holder) { ret = -ENOMEM; err = 
"cannot allocate memory"; - goto out_put_sb_page; + goto out_put_sb_folio; } /* Now reopen in exclusive mode with proper holder */ @@ -2667,8 +2667,8 @@ async_done: out_free_holder: kfree(holder); -out_put_sb_page: - put_page(virt_to_page(sb_disk)); +out_put_sb_folio: + folio_put(virt_to_folio(sb_disk)); out_blkdev_put: if (bdev_file) fput(bdev_file); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 79ba1e3a4770..5ef43231fe77 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -253,17 +253,35 @@ MODULE_PARM_DESC(max_read_size, "Maximum size of a read request"); static unsigned int max_write_size = 0; module_param(max_write_size, uint, 0644); MODULE_PARM_DESC(max_write_size, "Maximum size of a write request"); -static unsigned get_max_request_size(struct crypt_config *cc, bool wrt) + +static unsigned get_max_request_sectors(struct dm_target *ti, struct bio *bio) { + struct crypt_config *cc = ti->private; unsigned val, sector_align; - val = !wrt ? READ_ONCE(max_read_size) : READ_ONCE(max_write_size); - if (likely(!val)) - val = !wrt ? DM_CRYPT_DEFAULT_MAX_READ_SIZE : DM_CRYPT_DEFAULT_MAX_WRITE_SIZE; - if (wrt || cc->used_tag_size) { - if (unlikely(val > BIO_MAX_VECS << PAGE_SHIFT)) - val = BIO_MAX_VECS << PAGE_SHIFT; - } - sector_align = max(bdev_logical_block_size(cc->dev->bdev), (unsigned)cc->sector_size); + bool wrt = op_is_write(bio_op(bio)); + + if (wrt) { + /* + * For zoned devices, splitting write operations creates the + * risk of deadlocking queue freeze operations with zone write + * plugging BIO work when the reminder of a split BIO is + * issued. So always allow the entire BIO to proceed. + */ + if (ti->emulate_zone_append) + return bio_sectors(bio); + + val = min_not_zero(READ_ONCE(max_write_size), + DM_CRYPT_DEFAULT_MAX_WRITE_SIZE); + } else { + val = min_not_zero(READ_ONCE(max_read_size), + DM_CRYPT_DEFAULT_MAX_READ_SIZE); + } + + if (wrt || cc->used_tag_size) + val = min(val, BIO_MAX_VECS << PAGE_SHIFT); + + sector_align = max(bdev_logical_block_size(cc->dev->bdev), + (unsigned)cc->sector_size); val = round_down(val, sector_align); if (unlikely(!val)) val = sector_align; @@ -3496,7 +3514,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) /* * Check if bio is too large, split as needed. */ - max_sectors = get_max_request_size(cc, bio_data_dir(bio) == WRITE); + max_sectors = get_max_request_sectors(ti, bio); if (unlikely(bio_sectors(bio) > max_sectors)) dm_accept_partial_bio(bio, max_sectors); @@ -3733,6 +3751,17 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) max_t(unsigned int, limits->physical_block_size, cc->sector_size); limits->io_min = max_t(unsigned int, limits->io_min, cc->sector_size); limits->dma_alignment = limits->logical_block_size - 1; + + /* + * For zoned dm-crypt targets, there will be no internal splitting of + * write BIOs to avoid exceeding BIO_MAX_VECS vectors per BIO. But + * without respecting this limit, crypt_alloc_buffer() will trigger a + * BUG(). Avoid this by forcing DM core to split write BIOs to this + * limit. 
+ */ + if (ti->emulate_zone_append) + limits->max_hw_sectors = min(limits->max_hw_sectors, + BIO_MAX_VECS << PAGE_SECTORS_SHIFT); } static struct target_type crypt_target = { diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index a7dc04bd55e5..5bbbdf8fc1bd 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -458,6 +458,7 @@ static void stripe_io_hints(struct dm_target *ti, struct stripe_c *sc = ti->private; unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT; + limits->chunk_sectors = sc->chunk_size; limits->io_min = chunk_size; limits->io_opt = chunk_size * sc->stripes; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1726f0f828cc..abfe0392b5a4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1293,8 +1293,9 @@ out: /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management - * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by - * __send_duplicate_bios(). + * operations, zone append writes (native with REQ_OP_ZONE_APPEND or emulated + * with write BIOs flagged with BIO_EMULATES_ZONE_APPEND) and any bio serviced + * by __send_duplicate_bios(). * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be @@ -1327,11 +1328,19 @@ void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors) unsigned int bio_sectors = bio_sectors(bio); BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); - BUG_ON(op_is_zone_mgmt(bio_op(bio))); - BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); BUG_ON(bio_sectors > *tio->len_ptr); BUG_ON(n_sectors > bio_sectors); + if (static_branch_unlikely(&zoned_enabled) && + unlikely(bdev_is_zoned(bio->bi_bdev))) { + enum req_op op = bio_op(bio); + + BUG_ON(op_is_zone_mgmt(op)); + BUG_ON(op == REQ_OP_WRITE); + BUG_ON(op == REQ_OP_WRITE_ZEROES); + BUG_ON(op == REQ_OP_ZONE_APPEND); + } + *tio->len_ptr -= bio_sectors - n_sectors; bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; @@ -1776,19 +1785,35 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io, } #ifdef CONFIG_BLK_DEV_ZONED -static inline bool dm_zone_bio_needs_split(struct mapped_device *md, - struct bio *bio) +static inline bool dm_zone_bio_needs_split(struct bio *bio) { /* - * For mapped device that need zone append emulation, we must - * split any large BIO that straddles zone boundaries. + * Special case the zone operations that cannot or should not be split. */ - return dm_emulate_zone_append(md) && bio_straddles_zones(bio) && - !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); + switch (bio_op(bio)) { + case REQ_OP_ZONE_APPEND: + case REQ_OP_ZONE_FINISH: + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + return false; + default: + break; + } + + /* + * When mapped devices use the block layer zone write plugging, we must + * split any large BIO to the mapped device limits to not submit BIOs + * that span zone boundaries and to avoid potential deadlocks with + * queue freeze operations. 
+ */ + return bio_needs_zone_write_plugging(bio) || bio_straddles_zones(bio); } + static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) { - return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0); + if (!bio_needs_zone_write_plugging(bio)) + return false; + return blk_zone_plug_bio(bio, 0); } static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci, @@ -1904,8 +1929,7 @@ static blk_status_t __send_zone_reset_all(struct clone_info *ci) } #else -static inline bool dm_zone_bio_needs_split(struct mapped_device *md, - struct bio *bio) +static inline bool dm_zone_bio_needs_split(struct bio *bio) { return false; } @@ -1932,9 +1956,7 @@ static void dm_split_and_process_bio(struct mapped_device *md, is_abnormal = is_abnormal_io(bio); if (static_branch_unlikely(&zoned_enabled)) { - /* Special case REQ_OP_ZONE_RESET_ALL as it cannot be split. */ - need_split = (bio_op(bio) != REQ_OP_ZONE_RESET_ALL) && - (is_abnormal || dm_zone_bio_needs_split(md, bio)); + need_split = is_abnormal || dm_zone_bio_needs_split(bio); } else { need_split = is_abnormal; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 0f03b21e66e4..046fe85c76fe 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -636,9 +636,6 @@ static void __mddev_put(struct mddev *mddev) mddev->ctime || mddev->hold_active) return; - /* Array is not configured at all, and not held active, so destroy it */ - set_bit(MD_DELETED, &mddev->flags); - /* * Call queue_work inside the spinlock so that flush_workqueue() after * mddev_find will succeed in waiting for the work to be done. @@ -873,6 +870,16 @@ void mddev_unlock(struct mddev *mddev) kobject_del(&rdev->kobj); export_rdev(rdev, mddev); } + + /* Call del_gendisk after release reconfig_mutex to avoid + * deadlock (e.g. call del_gendisk under the lock and an + * access to sysfs files waits the lock) + * And MD_DELETED is only used for md raid which is set in + * do_md_stop. dm raid only uses md_stop to stop. So dm raid + * doesn't need to check MD_DELETED when getting reconfig lock + */ + if (test_bit(MD_DELETED, &mddev->flags)) + del_gendisk(mddev->gendisk); } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -5774,19 +5781,30 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; + struct kernfs_node *kn = NULL; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + if (entry->store == array_state_store && cmd_match(page, "clear")) + kn = sysfs_break_active_protection(kobj, attr); + spin_lock(&all_mddevs_lock); if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); + if (kn) + sysfs_unbreak_active_protection(kn); return -EBUSY; } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); mddev_put(mddev); + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } @@ -5794,12 +5812,6 @@ static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); - if (mddev->sysfs_state) - sysfs_put(mddev->sysfs_state); - if (mddev->sysfs_level) - sysfs_put(mddev->sysfs_level); - - del_gendisk(mddev->gendisk); put_disk(mddev->gendisk); } @@ -6413,15 +6425,10 @@ static void md_clean(struct mddev *mddev) mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; - /* - * Don't clear MD_CLOSING, or mddev can be opened again. - * 'hold_active != 0' means mddev is still in the creation - * process and will be used later. 
- */ - if (mddev->hold_active) - mddev->flags = 0; - else - mddev->flags &= BIT_ULL_MASK(MD_CLOSING); + /* if UNTIL_STOP is set, it's cleared here */ + mddev->hold_active = 0; + /* Don't clear MD_CLOSING, or mddev can be opened again. */ + mddev->flags &= BIT_ULL_MASK(MD_CLOSING); mddev->sb_flags = 0; mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; @@ -6516,8 +6523,6 @@ static void __md_stop(struct mddev *mddev) if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; - if (pers->sync_request && mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; put_pers(pers); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -6646,10 +6651,8 @@ static int do_md_stop(struct mddev *mddev, int mode) mddev->bitmap_info.offset = 0; export_array(mddev); - md_clean(mddev); - if (mddev->hold_active == UNTIL_STOP) - mddev->hold_active = 0; + set_bit(MD_DELETED, &mddev->flags); } md_new_event(); sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -9456,17 +9459,11 @@ static bool md_spares_need_change(struct mddev *mddev) return false; } -static int remove_and_add_spares(struct mddev *mddev, - struct md_rdev *this) +static int remove_spares(struct mddev *mddev, struct md_rdev *this) { struct md_rdev *rdev; - int spares = 0; int removed = 0; - if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - /* Mustn't remove devices when resync thread is running */ - return 0; - rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev_removeable(rdev) && !mddev->pers->hot_remove_disk(mddev, rdev)) { @@ -9480,6 +9477,21 @@ static int remove_and_add_spares(struct mddev *mddev, if (removed && mddev->kobj.sd) sysfs_notify_dirent_safe(mddev->sysfs_degraded); + return removed; +} + +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) +{ + struct md_rdev *rdev; + int spares = 0; + int removed = 0; + + if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + /* Mustn't remove devices when resync thread is running */ + return 0; + + removed = remove_spares(mddev, this); if (this && removed) goto no_add; @@ -9522,6 +9534,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) /* Check if resync is in progress. */ if (mddev->recovery_cp < MaxSector) { + remove_spares(mddev, NULL); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); return true; diff --git a/drivers/md/md.h b/drivers/md/md.h index d45a9e6ead80..67b365621507 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -700,11 +700,26 @@ static inline bool reshape_interrupted(struct mddev *mddev) static inline int __must_check mddev_lock(struct mddev *mddev) { - return mutex_lock_interruptible(&mddev->reconfig_mutex); + int ret; + + ret = mutex_lock_interruptible(&mddev->reconfig_mutex); + + /* MD_DELETED is set in do_md_stop with reconfig_mutex. + * So check it here. + */ + if (!ret && test_bit(MD_DELETED, &mddev->flags)) { + ret = -ENODEV; + mutex_unlock(&mddev->reconfig_mutex); + } + + return ret; } /* Sometimes we need to take the lock in a situation where * failure due to interrupts is not acceptable. + * It doesn't need to check MD_DELETED here, the owner which + * holds the lock here can't be stopped. And all paths can't + * call this function after do_md_stop. 
*/ static inline void mddev_lock_nointr(struct mddev *mddev) { @@ -713,7 +728,14 @@ static inline void mddev_lock_nointr(struct mddev *mddev) static inline int mddev_trylock(struct mddev *mddev) { - return mutex_trylock(&mddev->reconfig_mutex); + int ret; + + ret = mutex_trylock(&mddev->reconfig_mutex); + if (!ret && test_bit(MD_DELETED, &mddev->flags)) { + ret = -ENODEV; + mutex_unlock(&mddev->reconfig_mutex); + } + return ret; } extern void mddev_unlock(struct mddev *mddev); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d8f639f4ae12..cbe2a9054cb9 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -384,6 +384,7 @@ static int raid0_set_limits(struct mddev *mddev) lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; + lim.chunk_sectors = mddev->chunk_sectors; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c9bd2005bfd0..95dc354a86a0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2446,15 +2446,12 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * that are active */ for (i = 0; i < conf->copies; i++) { - int d; - tbio = r10_bio->devs[i].repl_bio; if (!tbio || !tbio->bi_end_io) continue; if (r10_bio->devs[i].bio->bi_end_io != end_sync_write && r10_bio->devs[i].bio != fbio) bio_copy_data(tbio, fbio); - d = r10_bio->devs[i].devnum; atomic_inc(&r10_bio->remaining); submit_bio_noacct(tbio); } @@ -4012,6 +4009,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; + lim.chunk_sectors = mddev->chunk_sectors; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ca5b0e8ba707..7ec61ee7b218 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -9040,7 +9040,7 @@ static int __init raid5_init(void) int ret; raid5_wq = alloc_workqueue("raid5wq", - WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); + WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_SYSFS, 0); if (!raid5_wq) return -ENOMEM; diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index b1fddfa33ab9..1286c31320e6 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -301,8 +301,8 @@ static void apple_nvme_submit_cmd(struct apple_nvme_queue *q, memcpy(&q->sqes[tag], cmd, sizeof(*cmd)); /* - * This lock here doesn't make much sense at a first glace but - * removing it will result in occasional missed completetion + * This lock here doesn't make much sense at a first glance but + * removing it will result in occasional missed completion * interrupts even though the commands still appear on the CQ. 
* It's unclear why this happens but our best guess is that * there is a bug in the firmware triggered when a new command diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index 1a0058be5821..dc90df9e13a2 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -133,7 +133,7 @@ static const char * const nvme_statuses[] = { [NVME_SC_NS_NOT_ATTACHED] = "Namespace Not Attached", [NVME_SC_THIN_PROV_NOT_SUPP] = "Thin Provisioning Not Supported", [NVME_SC_CTRL_LIST_INVALID] = "Controller List Invalid", - [NVME_SC_SELT_TEST_IN_PROGRESS] = "Device Self-test In Progress", + [NVME_SC_SELF_TEST_IN_PROGRESS] = "Device Self-test In Progress", [NVME_SC_BP_WRITE_PROHIBITED] = "Boot Partition Write Prohibited", [NVME_SC_CTRL_ID_INVALID] = "Invalid Controller Identifier", [NVME_SC_SEC_CTRL_STATE_INVALID] = "Invalid Secondary Controller State", @@ -145,7 +145,7 @@ static const char * const nvme_statuses[] = { [NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes", [NVME_SC_INVALID_PI] = "Invalid Protection Information", [NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range", - [NVME_SC_CMD_SIZE_LIM_EXCEEDED ] = "Command Size Limits Exceeded", + [NVME_SC_CMD_SIZE_LIM_EXCEEDED] = "Command Size Limits Exceeded", [NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error", [NVME_SC_ZONE_FULL] = "Zone Is Full", [NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only", diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 29d6f7f85f74..9d988f4cb87a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4300,7 +4300,7 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) } /* - * If available try to use the Command Set Idependent Identify Namespace + * If available try to use the Command Set Independent Identify Namespace * data structure to find all the generic information that is needed to * set up a namespace. If not fall back to the legacy version. */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 014b387f1e8b..08a5ea3e9383 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -899,7 +899,7 @@ EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss); * may crash. * * As such: - * Wrapper all the dma routines and check the dev pointer. + * Wrap all the dma routines and check the dev pointer. * * If simple mappings (return just a dma address, we'll noop them, * returning a dma address of 0. @@ -1955,8 +1955,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) } /* - * For the linux implementation, if we have an unsucceesful - * status, they blk-mq layer can typically be called with the + * For the linux implementation, if we have an unsuccessful + * status, the blk-mq layer can typically be called with the * non-zero status and the content of the cqe isn't important. */ if (status) @@ -2429,7 +2429,7 @@ static bool nvme_fc_terminate_exchange(struct request *req, void *data) /* * This routine runs through all outstanding commands on the association - * and aborts them. This routine is typically be called by the + * and aborts them. This routine is typically called by the * delete_association routine. It is also called due to an error during * reconnect. In that scenario, it is most likely a command that initializes * the controller, including fabric Connect commands on io queues, that @@ -2622,7 +2622,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, * as part of the exchange. 
The CQE is the last thing for the io, * which is transferred (explicitly or implicitly) with the RSP IU * sent on the exchange. After the CQE is received, the FC exchange is - * terminaed and the Exchange may be used on a different io. + * terminated and the Exchange may be used on a different io. * * The transport to LLDD api has the transport making a request for a * new fcp io request to the LLDD. The LLDD then allocates a FC exchange diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7df2ea21851f..cfd2b5b90b91 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -69,7 +69,7 @@ enum nvme_quirks { NVME_QUIRK_IDENTIFY_CNS = (1 << 1), /* - * The controller deterministically returns O's on reads to + * The controller deterministically returns 0's on reads to * logical blocks that deallocate was called on. */ NVME_QUIRK_DEALLOCATE_ZEROES = (1 << 2), diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 320aaa41ec39..071efec25346 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -7,7 +7,7 @@ #include <linux/acpi.h> #include <linux/async.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> +#include <linux/blk-mq-dma.h> #include <linux/blk-integrity.h> #include <linux/dmi.h> #include <linux/init.h> @@ -27,7 +27,6 @@ #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/io-64-nonatomic-hi-lo.h> #include <linux/sed-opal.h> -#include <linux/pci-p2pdma.h> #include "trace.h" #include "nvme.h" @@ -39,20 +38,17 @@ #define NVME_SMALL_POOL_SIZE 256 /* - * These can be higher, but we need to ensure that any command doesn't - * require an sg allocation that needs more than a page of data. + * Arbitrary upper bound. */ -#define NVME_MAX_KB_SZ 8192 +#define NVME_MAX_BYTES SZ_8M #define NVME_MAX_NR_DESCRIPTORS 5 /* - * For data SGLs we support a single descriptors worth of SGL entries, but for - * now we also limit it to avoid an allocation larger than PAGE_SIZE for the - * scatterlist. + * For data SGLs we support a single descriptors worth of SGL entries. + * For PRPs, segments don't matter at all. */ #define NVME_MAX_SEGS \ - min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \ - (PAGE_SIZE / sizeof(struct scatterlist))) + (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) /* * For metadata SGLs, only the small descriptor is supported, and the first @@ -61,6 +57,21 @@ #define NVME_MAX_META_SEGS \ ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1) +/* + * The last entry is used to link to the next descriptor. + */ +#define PRPS_PER_PAGE \ + (((NVME_CTRL_PAGE_SIZE / sizeof(__le64))) - 1) + +/* + * I/O could be non-aligned both at the beginning and end. 
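+ *
+ * As a worked example, assuming the usual 4KiB NVME_CTRL_PAGE_SIZE: an
+ * 8MiB transfer covers 2048 controller pages, and worst-case alignment
+ * adds at most one more, so up to 2049 PRP entries are needed.  That is
+ * well within the 1 + 5 * 511 = 2556 entries provided by PRP1 plus
+ * NVME_MAX_NR_DESCRIPTORS list pages of PRPS_PER_PAGE usable entries
+ * each, which is what the static_assert below verifies.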
+ */ +#define MAX_PRP_RANGE \ + (NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1)) + +static_assert(MAX_PRP_RANGE / NVME_CTRL_PAGE_SIZE <= + (1 /* prp1 */ + NVME_MAX_NR_DESCRIPTORS * PRPS_PER_PAGE)); + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0444); @@ -97,7 +108,7 @@ static int io_queue_count_set(const char *val, const struct kernel_param *kp) int ret; ret = kstrtouint(val, 10, &n); - if (ret != 0 || n > num_possible_cpus()) + if (ret != 0 || n > blk_mq_num_possible_queues(0)) return -EINVAL; return param_set_uint(val, kp); } @@ -162,7 +173,7 @@ struct nvme_dev { bool hmb; struct sg_table *hmb_sgt; - mempool_t *iod_mempool; + mempool_t *dmavec_mempool; mempool_t *iod_meta_mempool; /* shadow doorbell buffer support: */ @@ -246,7 +257,15 @@ enum nvme_iod_flags { IOD_ABORTED = 1U << 0, /* uses the small descriptor pool */ - IOD_SMALL_DESCRIPTOR = 1U << 1, + IOD_SMALL_DESCRIPTOR = 1U << 1, + + /* single segment dma mapping */ + IOD_SINGLE_SEGMENT = 1U << 2, +}; + +struct nvme_dma_vec { + dma_addr_t addr; + unsigned int len; }; /* @@ -257,13 +276,16 @@ struct nvme_iod { struct nvme_command cmd; u8 flags; u8 nr_descriptors; - unsigned int dma_len; /* length of single DMA segment mapping */ - dma_addr_t first_dma; + + unsigned int total_len; + struct dma_iova_state dma_state; + void *descriptors[NVME_MAX_NR_DESCRIPTORS]; + struct nvme_dma_vec *dma_vecs; + unsigned int nr_dma_vecs; + dma_addr_t meta_dma; - struct sg_table sgt; struct sg_table meta_sgt; struct nvme_sgl_desc *meta_descriptor; - void *descriptors[NVME_MAX_NR_DESCRIPTORS]; }; static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) @@ -406,18 +428,6 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, return true; } -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. 
- */ -static __always_inline int nvme_pci_npages_prp(void) -{ - unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; - unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); -} - static struct nvme_descriptor_pools * nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node) { @@ -578,32 +588,49 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) spin_unlock(&nvmeq->sq_lock); } -static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev, - struct request *req) +enum nvme_use_sgl { + SGL_UNSUPPORTED, + SGL_SUPPORTED, + SGL_FORCED, +}; + +static inline bool nvme_pci_metadata_use_sgls(struct request *req) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl)) return false; return req->nr_integrity_segments > 1 || nvme_req(req)->flags & NVME_REQ_USERCMD; } -static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, - int nseg) +static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev, + struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - unsigned int avg_seg_size; - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); + if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)) { + if (nvme_req(req)->flags & NVME_REQ_USERCMD) + return SGL_FORCED; + if (req->nr_integrity_segments > 1) + return SGL_FORCED; + return SGL_SUPPORTED; + } - if (!nvme_ctrl_sgl_supported(&dev->ctrl)) - return false; - if (!nvmeq->qid) - return false; - if (nvme_pci_metadata_use_sgls(dev, req)) - return true; - if (!sgl_threshold || avg_seg_size < sgl_threshold) - return nvme_req(req)->flags & NVME_REQ_USERCMD; - return true; + return SGL_UNSUPPORTED; +} + +static unsigned int nvme_pci_avg_seg_size(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + unsigned int nseg; + + if (blk_rq_dma_map_coalesce(&iod->dma_state)) + nseg = 1; + else + nseg = blk_rq_nr_phys_segments(req); + return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); } static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, @@ -614,11 +641,25 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, return nvmeq->descriptor_pools.large; } -static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req) +static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd) +{ + return cmd->common.flags & + (NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG); +} + +static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd) { + if (nvme_pci_cmd_use_sgl(cmd)) + return le64_to_cpu(cmd->common.dptr.sgl.addr); + return le64_to_cpu(cmd->common.dptr.prp2); +} + +static void nvme_free_descriptors(struct request *req) +{ + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - dma_addr_t dma_addr = iod->first_dma; + dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd); int i; if (iod->nr_descriptors == 1) { @@ -637,68 +678,130 @@ static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req) } } -static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq, - struct request *req) +static void nvme_free_prps(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + unsigned int i; - if (iod->dma_len) { - 
dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, - rq_dma_dir(req)); + for (i = 0; i < iod->nr_dma_vecs; i++) + dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr, + iod->dma_vecs[i].len, rq_dma_dir(req)); + mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool); +} + +static void nvme_free_sgls(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct device *dma_dev = nvmeq->dev->dev; + dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr); + unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length); + struct nvme_sgl_desc *sg_list = iod->descriptors[0]; + enum dma_data_direction dir = rq_dma_dir(req); + + if (iod->nr_descriptors) { + unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i; + + for (i = 0; i < nr_entries; i++) + dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr), + le32_to_cpu(sg_list[i].length), dir); + } else { + dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir); + } +} + +static void nvme_unmap_data(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct device *dma_dev = nvmeq->dev->dev; + + if (iod->flags & IOD_SINGLE_SEGMENT) { + static_assert(offsetof(union nvme_data_ptr, prp1) == + offsetof(union nvme_data_ptr, sgl.addr)); + dma_unmap_page(dma_dev, le64_to_cpu(iod->cmd.common.dptr.prp1), + iod->total_len, rq_dma_dir(req)); return; } - WARN_ON_ONCE(!iod->sgt.nents); + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { + if (nvme_pci_cmd_use_sgl(&iod->cmd)) + nvme_free_sgls(req); + else + nvme_free_prps(req); + } - dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); - nvme_free_descriptors(nvmeq, req); - mempool_free(iod->sgt.sgl, dev->iod_mempool); + if (iod->nr_descriptors) + nvme_free_descriptors(req); } -static void nvme_print_sgl(struct scatterlist *sgl, int nents) +static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev, + struct blk_dma_iter *iter) { - int i; - struct scatterlist *sg; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - for_each_sg(sgl, sg, nents, i) { - dma_addr_t phys = sg_phys(sg); - pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " - "dma_address:%pad dma_length:%d\n", - i, &phys, sg->offset, sg->length, &sg_dma_address(sg), - sg_dma_len(sg)); + if (iter->len) + return true; + if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter)) + return false; + if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) { + iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr; + iod->dma_vecs[iod->nr_dma_vecs].len = iter->len; + iod->nr_dma_vecs++; } + return true; } -static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, - struct request *req, struct nvme_rw_command *cmnd) +static blk_status_t nvme_pci_setup_data_prp(struct request *req, + struct blk_dma_iter *iter) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - int length = blk_rq_payload_bytes(req); - struct scatterlist *sg = iod->sgt.sgl; - int dma_len = sg_dma_len(sg); - u64 dma_addr = sg_dma_address(sg); - int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + unsigned int length = blk_rq_payload_bytes(req); + dma_addr_t prp1_dma, prp2_dma = 0; + unsigned int prp_len, i; __le64 *prp_list; - dma_addr_t prp_dma; - int i; - length -= (NVME_CTRL_PAGE_SIZE - offset); - if (length <= 0) { - iod->first_dma = 0; - goto done; + if (!dma_use_iova(&iod->dma_state) && 
dma_need_unmap(nvmeq->dev->dev)) { + iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool, + GFP_ATOMIC); + if (!iod->dma_vecs) + return BLK_STS_RESOURCE; + iod->dma_vecs[0].addr = iter->addr; + iod->dma_vecs[0].len = iter->len; + iod->nr_dma_vecs = 1; } - dma_len -= (NVME_CTRL_PAGE_SIZE - offset); - if (dma_len) { - dma_addr += (NVME_CTRL_PAGE_SIZE - offset); - } else { - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); + /* + * PRP1 always points to the start of the DMA transfers. + * + * This is the only PRP (except for the list entries) that could be + * non-aligned. + */ + prp1_dma = iter->addr; + prp_len = min(length, NVME_CTRL_PAGE_SIZE - + (iter->addr & (NVME_CTRL_PAGE_SIZE - 1))); + iod->total_len += prp_len; + iter->addr += prp_len; + iter->len -= prp_len; + length -= prp_len; + if (!length) + goto done; + + if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) { + if (WARN_ON_ONCE(!iter->status)) + goto bad_sgl; + goto done; } + /* + * PRP2 is usually a list, but can point to data if all data to be + * transferred fits into PRP1 + PRP2: + */ if (length <= NVME_CTRL_PAGE_SIZE) { - iod->first_dma = dma_addr; + prp2_dma = iter->addr; + iod->total_len += length; goto done; } @@ -707,58 +810,80 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq, iod->flags |= IOD_SMALL_DESCRIPTOR; prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, - &prp_dma); - if (!prp_list) - return BLK_STS_RESOURCE; + &prp2_dma); + if (!prp_list) { + iter->status = BLK_STS_RESOURCE; + goto done; + } iod->descriptors[iod->nr_descriptors++] = prp_list; - iod->first_dma = prp_dma; + i = 0; for (;;) { + prp_list[i++] = cpu_to_le64(iter->addr); + prp_len = min(length, NVME_CTRL_PAGE_SIZE); + if (WARN_ON_ONCE(iter->len < prp_len)) + goto bad_sgl; + + iod->total_len += prp_len; + iter->addr += prp_len; + iter->len -= prp_len; + length -= prp_len; + if (!length) + break; + + if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) { + if (WARN_ON_ONCE(!iter->status)) + goto bad_sgl; + goto done; + } + + /* + * If we've filled the entire descriptor, allocate a new that is + * pointed to be the last entry in the previous PRP list. To + * accommodate for that move the last actual entry to the new + * descriptor. + */ if (i == NVME_CTRL_PAGE_SIZE >> 3) { __le64 *old_prp_list = prp_list; + dma_addr_t prp_list_dma; prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large, - GFP_ATOMIC, &prp_dma); - if (!prp_list) - goto free_prps; + GFP_ATOMIC, &prp_list_dma); + if (!prp_list) { + iter->status = BLK_STS_RESOURCE; + goto done; + } iod->descriptors[iod->nr_descriptors++] = prp_list; + prp_list[0] = old_prp_list[i - 1]; - old_prp_list[i - 1] = cpu_to_le64(prp_dma); + old_prp_list[i - 1] = cpu_to_le64(prp_list_dma); i = 1; } - prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= NVME_CTRL_PAGE_SIZE; - dma_addr += NVME_CTRL_PAGE_SIZE; - length -= NVME_CTRL_PAGE_SIZE; - if (length <= 0) - break; - if (dma_len > 0) - continue; - if (unlikely(dma_len < 0)) - goto bad_sgl; - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); } + done: - cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl)); - cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); - return BLK_STS_OK; -free_prps: - nvme_free_descriptors(nvmeq, req); - return BLK_STS_RESOURCE; + /* + * nvme_unmap_data uses the DPT field in the SQE to tear down the + * mapping, so initialize it even for failures. 
+ */ + iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma); + iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma); + if (unlikely(iter->status)) + nvme_unmap_data(req); + return iter->status; + bad_sgl: - WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents), - "Invalid SGL for payload:%d nents:%d\n", - blk_rq_payload_bytes(req), iod->sgt.nents); + dev_err_once(nvmeq->dev->dev, + "Incorrectly formed request for payload:%d nents:%d\n", + blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req)); return BLK_STS_IOERR; } static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, - struct scatterlist *sg) + struct blk_dma_iter *iter) { - sge->addr = cpu_to_le64(sg_dma_address(sg)); - sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->addr = cpu_to_le64(iter->addr); + sge->length = cpu_to_le32(iter->len); sge->type = NVME_SGL_FMT_DATA_DESC << 4; } @@ -770,21 +895,22 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } -static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, - struct request *req, struct nvme_rw_command *cmd) +static blk_status_t nvme_pci_setup_data_sgl(struct request *req, + struct blk_dma_iter *iter) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + unsigned int entries = blk_rq_nr_phys_segments(req); struct nvme_sgl_desc *sg_list; - struct scatterlist *sg = iod->sgt.sgl; - unsigned int entries = iod->sgt.nents; dma_addr_t sgl_dma; - int i = 0; + unsigned int mapped = 0; - /* setting the transfer type as SGL */ - cmd->flags = NVME_CMD_SGL_METABUF; + /* set the transfer type as SGL */ + iod->cmd.common.flags = NVME_CMD_SGL_METABUF; - if (entries == 1) { - nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); + if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) { + nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter); + iod->total_len += iter->len; return BLK_STS_OK; } @@ -796,119 +922,104 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq, if (!sg_list) return BLK_STS_RESOURCE; iod->descriptors[iod->nr_descriptors++] = sg_list; - iod->first_dma = sgl_dma; - nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); do { - nvme_pci_sgl_set_data(&sg_list[i++], sg); - sg = sg_next(sg); - } while (--entries > 0); + if (WARN_ON_ONCE(mapped == entries)) { + iter->status = BLK_STS_IOERR; + break; + } + nvme_pci_sgl_set_data(&sg_list[mapped++], iter); + iod->total_len += iter->len; + } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state, + iter)); - return BLK_STS_OK; + nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); + if (unlikely(iter->status)) + nvme_free_sgls(req); + return iter->status; } -static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd, - struct bio_vec *bv) +static blk_status_t nvme_pci_setup_data_simple(struct request *req, + enum nvme_use_sgl use_sgl) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1); - unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset; - - iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); - if (dma_mapping_error(dev->dev, iod->first_dma)) + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct bio_vec bv = req_bvec(req); + unsigned int prp1_offset = bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1); + bool prp_possible = prp1_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2; + dma_addr_t dma_addr; + + if (!use_sgl && !prp_possible) + 
return BLK_STS_AGAIN; + if (is_pci_p2pdma_page(bv.bv_page)) + return BLK_STS_AGAIN; + + dma_addr = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0); + if (dma_mapping_error(nvmeq->dev->dev, dma_addr)) return BLK_STS_RESOURCE; - iod->dma_len = bv->bv_len; - - cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); - if (bv->bv_len > first_prp_len) - cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); - else - cmnd->dptr.prp2 = 0; - return BLK_STS_OK; -} - -static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd, - struct bio_vec *bv) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + iod->total_len = bv.bv_len; + iod->flags |= IOD_SINGLE_SEGMENT; + + if (use_sgl == SGL_FORCED || !prp_possible) { + iod->cmd.common.flags = NVME_CMD_SGL_METABUF; + iod->cmd.common.dptr.sgl.addr = cpu_to_le64(dma_addr); + iod->cmd.common.dptr.sgl.length = cpu_to_le32(bv.bv_len); + iod->cmd.common.dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; + } else { + unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - prp1_offset; - iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); - if (dma_mapping_error(dev->dev, iod->first_dma)) - return BLK_STS_RESOURCE; - iod->dma_len = bv->bv_len; + iod->cmd.common.dptr.prp1 = cpu_to_le64(dma_addr); + iod->cmd.common.dptr.prp2 = 0; + if (bv.bv_len > first_prp_len) + iod->cmd.common.dptr.prp2 = + cpu_to_le64(dma_addr + first_prp_len); + } - cmnd->flags = NVME_CMD_SGL_METABUF; - cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma); - cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len); - cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4; return BLK_STS_OK; } -static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, - struct nvme_command *cmnd) +static blk_status_t nvme_map_data(struct request *req) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - blk_status_t ret = BLK_STS_RESOURCE; - int rc; + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + enum nvme_use_sgl use_sgl = nvme_pci_use_sgls(dev, req); + struct blk_dma_iter iter; + blk_status_t ret; + /* + * Try to skip the DMA iterator for single segment requests, as that + * significantly improves performances for small I/O sizes. 
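+ *
+ * The resulting mapping strategy, roughly, is:
+ *
+ *	single segment, not P2P         -> inline PRP1/PRP2 or one data SGL
+ *	SGL_FORCED (user cmd / >1 meta) -> SGL descriptor list
+ *	SGL_SUPPORTED and average segment
+ *	size >= sgl_threshold           -> SGL descriptor list
+ *	everything else                 -> PRP list(s)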
+ */ if (blk_rq_nr_phys_segments(req) == 1) { - struct bio_vec bv = req_bvec(req); - - if (!is_pci_p2pdma_page(bv.bv_page)) { - if (!nvme_pci_metadata_use_sgls(dev, req) && - (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + - bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) - return nvme_setup_prp_simple(dev, req, - &cmnd->rw, &bv); - - if (nvmeq->qid && sgl_threshold && - nvme_ctrl_sgl_supported(&dev->ctrl)) - return nvme_setup_sgl_simple(dev, req, - &cmnd->rw, &bv); - } + ret = nvme_pci_setup_data_simple(req, use_sgl); + if (ret != BLK_STS_AGAIN) + return ret; } - iod->dma_len = 0; - iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); - if (!iod->sgt.sgl) - return BLK_STS_RESOURCE; - sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); - iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl); - if (!iod->sgt.orig_nents) - goto out_free_sg; + if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) + return iter.status; - rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), - DMA_ATTR_NO_WARN); - if (rc) { - if (rc == -EREMOTEIO) - ret = BLK_STS_TARGET; - goto out_free_sg; - } - - if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) - ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw); - else - ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw); - if (ret != BLK_STS_OK) - goto out_unmap_sg; - return BLK_STS_OK; + if (use_sgl == SGL_FORCED || + (use_sgl == SGL_SUPPORTED && + (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) + return nvme_pci_setup_data_sgl(req, &iter); + return nvme_pci_setup_data_prp(req, &iter); +} -out_unmap_sg: - dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); -out_free_sg: - mempool_free(iod->sgt.sgl, dev->iod_mempool); - return ret; +static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge, + struct scatterlist *sg) +{ + sge->addr = cpu_to_le64(sg_dma_address(sg)); + sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->type = NVME_SGL_FMT_DATA_DESC << 4; } -static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, - struct request *req) +static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_rw_command *cmnd = &iod->cmd.rw; struct nvme_sgl_desc *sg_list; struct scatterlist *sgl, *sg; unsigned int entries; @@ -939,19 +1050,19 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, iod->meta_descriptor = sg_list; iod->meta_dma = sgl_dma; - cmnd->flags = NVME_CMD_SGL_METASEG; - cmnd->metadata = cpu_to_le64(sgl_dma); + iod->cmd.common.flags = NVME_CMD_SGL_METASEG; + iod->cmd.common.metadata = cpu_to_le64(sgl_dma); sgl = iod->meta_sgt.sgl; if (entries == 1) { - nvme_pci_sgl_set_data(sg_list, sgl); + nvme_pci_sgl_set_data_sg(sg_list, sgl); return BLK_STS_OK; } sgl_dma += sizeof(*sg_list); nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries); for_each_sg(sgl, sg, entries, i) - nvme_pci_sgl_set_data(&sg_list[i + 1], sg); + nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg); return BLK_STS_OK; @@ -962,38 +1073,37 @@ out_free_sg: return BLK_STS_RESOURCE; } -static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev, - struct request *req) +static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct bio_vec bv = rq_integrity_vec(req); - struct nvme_command *cmnd = &iod->cmd; - iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0); - 
if (dma_mapping_error(dev->dev, iod->meta_dma)) + iod->meta_dma = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0); + if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma)) return BLK_STS_IOERR; - cmnd->rw.metadata = cpu_to_le64(iod->meta_dma); + iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma); return BLK_STS_OK; } -static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req) +static blk_status_t nvme_map_metadata(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) && - nvme_pci_metadata_use_sgls(dev, req)) - return nvme_pci_setup_meta_sgls(dev, req); - return nvme_pci_setup_meta_mptr(dev, req); + nvme_pci_metadata_use_sgls(req)) + return nvme_pci_setup_meta_sgls(req); + return nvme_pci_setup_meta_mptr(req); } -static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) +static blk_status_t nvme_prep_rq(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret; iod->flags = 0; iod->nr_descriptors = 0; - iod->sgt.nents = 0; + iod->total_len = 0; iod->meta_sgt.nents = 0; ret = nvme_setup_cmd(req->q->queuedata, req); @@ -1001,13 +1111,13 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) return ret; if (blk_rq_nr_phys_segments(req)) { - ret = nvme_map_data(dev, req, &iod->cmd); + ret = nvme_map_data(req); if (ret) goto out_free_cmd; } if (blk_integrity_rq(req)) { - ret = nvme_map_metadata(dev, req); + ret = nvme_map_metadata(req); if (ret) goto out_unmap_data; } @@ -1016,7 +1126,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) return BLK_STS_OK; out_unmap_data: if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, req->mq_hctx->driver_data, req); + nvme_unmap_data(req); out_free_cmd: nvme_cleanup_cmd(req); return ret; @@ -1041,7 +1151,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(!nvme_check_ready(&dev->ctrl, req, true))) return nvme_fail_nonready_command(&dev->ctrl, req); - ret = nvme_prep_rq(dev, req); + ret = nvme_prep_rq(req); if (unlikely(ret)) return ret; spin_lock(&nvmeq->sq_lock); @@ -1079,7 +1189,7 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req) if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true))) return false; - return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK; + return nvme_prep_rq(req) == BLK_STS_OK; } static void nvme_queue_rqs(struct rq_list *rqlist) @@ -1105,11 +1215,11 @@ static void nvme_queue_rqs(struct rq_list *rqlist) *rqlist = requeue_list; } -static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, - struct nvme_queue *nvmeq, - struct request *req) +static __always_inline void nvme_unmap_metadata(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; if (!iod->meta_sgt.nents) { dma_unmap_page(dev->dev, iod->meta_dma, @@ -1126,14 +1236,10 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, static __always_inline void nvme_pci_unmap_rq(struct request *req) { - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; - if (blk_integrity_rq(req)) - nvme_unmap_metadata(dev, nvmeq, req); - + nvme_unmap_metadata(req); if (blk_rq_nr_phys_segments(req)) - nvme_unmap_data(dev, nvmeq, req); + nvme_unmap_data(req); } static void nvme_pci_complete_rq(struct request *req) @@ -1958,8 +2064,28 @@ static int 
nvme_pci_configure_admin_queue(struct nvme_dev *dev) * might be pointing at! */ result = nvme_disable_ctrl(&dev->ctrl, false); - if (result < 0) - return result; + if (result < 0) { + struct pci_dev *pdev = to_pci_dev(dev->dev); + + /* + * The NVMe Controller Reset method did not get an expected + * CSTS.RDY transition, so something with the device appears to + * be stuck. Use the lower level and bigger hammer PCIe + * Function Level Reset to attempt restoring the device to its + * initial state, and try again. + */ + result = pcie_reset_flr(pdev, false); + if (result < 0) + return result; + + pci_restore_state(pdev); + result = nvme_disable_ctrl(&dev->ctrl, false); + if (result < 0) + return result; + + dev_info(dev->ctrl.device, + "controller reset completed after pcie flr\n"); + } result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); if (result) @@ -2331,7 +2457,7 @@ static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, { struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n", + return sysfs_emit(buf, "cmbloc : 0x%08x\ncmbsz : 0x%08x\n", ndev->cmbloc, ndev->cmbsz); } static DEVICE_ATTR_RO(cmb); @@ -2518,7 +2644,8 @@ static unsigned int nvme_max_io_queues(struct nvme_dev *dev) */ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) return 1; - return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues; + return blk_mq_num_possible_queues(0) + dev->nr_write_queues + + dev->nr_poll_queues; } static int nvme_setup_io_queues(struct nvme_dev *dev) @@ -2913,13 +3040,13 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); - size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS; + size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS; - dev->iod_mempool = mempool_create_node(1, + dev->dmavec_mempool = mempool_create_node(1, mempool_kmalloc, mempool_kfree, (void *)alloc_size, GFP_KERNEL, dev_to_node(dev->dev)); - if (!dev->iod_mempool) + if (!dev->dmavec_mempool) return -ENOMEM; dev->iod_meta_mempool = mempool_create_node(1, @@ -2928,10 +3055,9 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) dev_to_node(dev->dev)); if (!dev->iod_meta_mempool) goto free; - return 0; free: - mempool_destroy(dev->iod_mempool); + mempool_destroy(dev->dmavec_mempool); return -ENOMEM; } @@ -3272,7 +3398,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, * over a single page. 
*/ dev->ctrl.max_hw_sectors = min_t(u32, - NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9); + NVME_MAX_BYTES >> SECTOR_SHIFT, + dma_opt_mapping_size(&pdev->dev) >> 9); dev->ctrl.max_segments = NVME_MAX_SEGS; dev->ctrl.max_integrity_segments = 1; return dev; @@ -3380,7 +3507,7 @@ out_disable: nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); out_release_iod_mempool: - mempool_destroy(dev->iod_mempool); + mempool_destroy(dev->dmavec_mempool); mempool_destroy(dev->iod_meta_mempool); out_dev_unmap: nvme_dev_unmap(dev); @@ -3444,7 +3571,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove_admin(dev); nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); - mempool_destroy(dev->iod_mempool); + mempool_destroy(dev->dmavec_mempool); mempool_destroy(dev->iod_meta_mempool); nvme_release_descriptor_pools(dev); nvme_dev_unmap(dev); @@ -3847,7 +3974,6 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); - BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS); return pci_register_driver(&nvme_driver); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 9bd3646568d0..190a4cfa8a5e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -877,7 +877,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) /* * Only start IO queues for which we have allocated the tagset - * and limitted it to the available queues. On reconnects, the + * and limited it to the available queues. On reconnects, the * queue number might have changed. */ nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d924008c3949..9233f088fac8 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1745,9 +1745,14 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, qid, ret); tls_handshake_cancel(queue->sock->sk); } else { - dev_dbg(nctrl->device, - "queue %d: TLS handshake complete, error %d\n", - qid, queue->tls_err); + if (queue->tls_err) { + dev_err(nctrl->device, + "queue %d: TLS handshake complete, error %d\n", + qid, queue->tls_err); + } else { + dev_dbg(nctrl->device, + "queue %d: TLS handshake complete\n", qid); + } ret = queue->tls_err; } return ret; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 175c5b6d4dd5..884286f90688 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -581,8 +581,6 @@ int nvmet_ns_enable(struct nvmet_ns *ns) if (ns->enabled) goto out_unlock; - ret = -EMFILE; - ret = nvmet_bdev_ns_enable(ns); if (ret == -ENOTBLK) ret = nvmet_file_ns_enable(ns); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index b7515c53829b..3b4b0df8f879 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -106,7 +106,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) pctrl->max_hw_sectors); /* - * nvmet_passthru_map_sg is limitted to using a single bio so limit + * nvmet_passthru_map_sg is limited to using a single bio so limit * the mdts based on BIO_MAX_VECS as well */ max_hw_sectors = min_not_zero(BIO_MAX_VECS << PAGE_SECTORS_SHIFT, @@ -147,7 +147,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) * When passthru controller is setup using nvme-loop transport it will * export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in * the 
nvme/host/core.c in the nvme_init_subsystem()->nvme_active_ctrl() - * code path with duplicate ctr subsynqn. In order to prevent that we + * code path with duplicate ctrl subsysnqn. In order to prevent that we * mask the passthru-ctrl subsysnqn with the target ctrl subsysnqn. */ memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn)); diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index a4295a5b8d28..2e78397a7373 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1242,8 +1242,11 @@ static void nvmet_pci_epf_queue_response(struct nvmet_req *req) iod->status = le16_to_cpu(req->cqe->status) >> 1; - /* If we have no data to transfer, directly complete the command. */ - if (!iod->data_len || iod->dma_dir != DMA_TO_DEVICE) { + /* + * If the command failed or we have no data to transfer, complete the + * command immediately. + */ + if (iod->status || !iod->data_len || iod->dma_dir != DMA_TO_DEVICE) { nvmet_pci_epf_complete_iod(iod); return; } @@ -1604,8 +1607,13 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work) goto complete; } + /* + * If nvmet_req_init() fails (e.g., unsupported opcode) it will call + * __nvmet_req_complete() internally which will call + * nvmet_pci_epf_queue_response() and will complete the command directly. + */ if (!nvmet_req_init(req, &iod->sq->nvme_sq, &nvmet_pci_epf_fabrics_ops)) - goto complete; + return; iod->data_len = nvmet_req_transfer_len(req); if (iod->data_len) { @@ -1643,10 +1651,11 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work) wait_for_completion(&iod->done); - if (iod->status == NVME_SC_SUCCESS) { - WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE); - nvmet_pci_epf_transfer_iod_data(iod); - } + if (iod->status != NVME_SC_SUCCESS) + return; + + WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE); + nvmet_pci_epf_transfer_iod_data(iod); complete: nvmet_pci_epf_complete_iod(iod); @@ -1860,7 +1869,7 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) ctrl->io_cqes = 1UL << nvmet_cc_iocqes(ctrl->cc); if (ctrl->io_cqes < sizeof(struct nvme_completion)) { dev_err(ctrl->dev, "Unsupported I/O CQES %zu (need %zu)\n", - ctrl->io_sqes, sizeof(struct nvme_completion)); + ctrl->io_cqes, sizeof(struct nvme_completion)); goto err; } diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 29a60fabfcc8..15a579cf528c 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -541,7 +541,7 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) struct bio *bio; int sg_cnt; - /* Request is completed on len mismatch in nvmet_check_transter_len() */ + /* Request is completed on len mismatch in nvmet_check_transfer_len() */ if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req))) return; diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 9179f8aee964..615e06fd4ee8 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -5971,7 +5971,8 @@ megasas_alloc_irq_vectors(struct megasas_instance *instance) else instance->iopoll_q_count = 0; - num_msix_req = num_online_cpus() + instance->low_latency_index_start; + num_msix_req = blk_mq_num_online_queues(0) + + instance->low_latency_index_start; instance->msix_vectors = min(num_msix_req, instance->msix_vectors); @@ -5987,7 +5988,8 @@ megasas_alloc_irq_vectors(struct megasas_instance *instance) /* Disable Balanced IOPS mode and try realloc vectors */ 
instance->perf_mode = MR_LATENCY_PERF_MODE; instance->low_latency_index_start = 1; - num_msix_req = num_online_cpus() + instance->low_latency_index_start; + num_msix_req = blk_mq_num_online_queues(0) + + instance->low_latency_index_start; instance->msix_vectors = min(num_msix_req, instance->msix_vectors); @@ -6243,7 +6245,7 @@ static int megasas_init_fw(struct megasas_instance *instance) intr_coalescing = (scratch_pad_1 & MR_INTR_COALESCING_SUPPORT_OFFSET) ? true : false; if (intr_coalescing && - (num_online_cpus() >= MR_HIGH_IOPS_QUEUE_COUNT) && + (blk_mq_num_online_queues(0) >= MR_HIGH_IOPS_QUEUE_COUNT) && (instance->msix_vectors == MEGASAS_MAX_MSIX_QUEUES)) instance->perf_mode = MR_BALANCED_PERF_MODE; else @@ -6287,7 +6289,8 @@ static int megasas_init_fw(struct megasas_instance *instance) else instance->low_latency_index_start = 1; - num_msix_req = num_online_cpus() + instance->low_latency_index_start; + num_msix_req = blk_mq_num_online_queues(0) + + instance->low_latency_index_start; instance->msix_vectors = min(num_msix_req, instance->msix_vectors); @@ -6319,8 +6322,8 @@ static int megasas_init_fw(struct megasas_instance *instance) megasas_setup_reply_map(instance); dev_info(&instance->pdev->dev, - "current msix/online cpus\t: (%d/%d)\n", - instance->msix_vectors, (unsigned int)num_online_cpus()); + "current msix/max num queues\t: (%d/%u)\n", + instance->msix_vectors, blk_mq_num_online_queues(0)); dev_info(&instance->pdev->dev, "RDPQ mode\t: (%s)\n", instance->is_rdpq ? "enabled" : "disabled"); diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index fe98c76e9be3..c4c6b5c6658c 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -4533,13 +4533,13 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp) if (USER_CTRL_IRQ(ha) || !ha->mqiobase) { /* user wants to control IRQ setting for target mode */ ret = pci_alloc_irq_vectors(ha->pdev, min_vecs, - min((u16)ha->msix_count, (u16)(num_online_cpus() + min_vecs)), - PCI_IRQ_MSIX); + blk_mq_num_online_queues(ha->msix_count) + min_vecs, + PCI_IRQ_MSIX); } else ret = pci_alloc_irq_vectors_affinity(ha->pdev, min_vecs, - min((u16)ha->msix_count, (u16)(num_online_cpus() + min_vecs)), - PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, - &desc); + blk_mq_num_online_queues(ha->msix_count) + min_vecs, + PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, + &desc); if (ret < 0) { ql_log(ql_log_fatal, vha, 0x00c7, diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 3d40a63e378d..125944941601 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -5294,15 +5294,14 @@ static void pqi_calculate_queue_resources(struct pqi_ctrl_info *ctrl_info) if (is_kdump_kernel()) { num_queue_groups = 1; } else { - int num_cpus; int max_queue_groups; max_queue_groups = min(ctrl_info->max_inbound_queues / 2, ctrl_info->max_outbound_queues - 1); max_queue_groups = min(max_queue_groups, PQI_MAX_QUEUE_GROUPS); - num_cpus = num_online_cpus(); - num_queue_groups = min(num_cpus, ctrl_info->max_msix_vectors); + num_queue_groups = + blk_mq_num_online_queues(ctrl_info->max_msix_vectors); num_queue_groups = min(num_queue_groups, max_queue_groups); } diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 21ce3e940192..96a69edddbe5 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -919,6 +919,7 @@ static int virtscsi_probe(struct virtio_device *vdev) /* We need to know how many queues before we allocate. 
*/ num_queues = virtscsi_config_get(vdev, num_queues) ? : 1; num_queues = min_t(unsigned int, nr_cpu_ids, num_queues); + num_queues = blk_mq_num_possible_queues(num_queues); num_targets = virtscsi_config_get(vdev, max_target) + 1; diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 1f60c9d5cb18..a7b297dae489 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -329,20 +329,21 @@ create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { unsigned int this_vecs = affd->set_size[i]; + unsigned int nr_masks; int j; - struct cpumask *result = group_cpus_evenly(this_vecs); + struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks); if (!result) { kfree(masks); return NULL; } - for (j = 0; j < this_vecs; j++) + for (j = 0; j < nr_masks; j++) cpumask_copy(&masks[curvec + j], &result[j]); kfree(result); - curvec += this_vecs; - usedvecs += this_vecs; + curvec += nr_masks; + usedvecs += nr_masks; } /* Fill out vectors at the end that don't need affinity */ diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 53c2626e90e7..3fbfb1a2942b 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -862,7 +862,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work) static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs) { const struct cpumask *mask, *masks; - unsigned int q, cpu; + unsigned int q, cpu, nr_masks; /* First attempt to map using existing transport layer affinities * e.g. PCIe MSI-X @@ -882,7 +882,7 @@ static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f return; fallback: /* Attempt to map evenly in groups over the CPUs */ - masks = group_cpus_evenly(fs->num_request_queues); + masks = group_cpus_evenly(fs->num_request_queues, &nr_masks); /* If even this fails we default to all CPUs use first request queue */ if (!masks) { for_each_possible_cpu(cpu) @@ -891,7 +891,7 @@ fallback: } for (q = 0; q < fs->num_request_queues; q++) { - for_each_cpu(cpu, &masks[q]) + for_each_cpu(cpu, &masks[q % nr_masks]) fs->mq_map[cpu] = q + VQ_REQUEST; } kfree(masks); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0b690bc119d7..2133fbaf1766 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -673,11 +673,6 @@ static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp) return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT)); } -static inline unsigned int max_pow_of_two_factor(const unsigned int nr) -{ - return 1 << (ffs(nr) - 1); -} - /* * If the underlying device advertises atomic write support, limit the size of * atomic writes to the greatest power-of-two factor of the group size so diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h new file mode 100644 index 000000000000..c26a01aeae00 --- /dev/null +++ b/include/linux/blk-mq-dma.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef BLK_MQ_DMA_H +#define BLK_MQ_DMA_H + +#include <linux/blk-mq.h> +#include <linux/pci-p2pdma.h> + +struct blk_dma_iter { + /* Output address range for this iteration */ + dma_addr_t addr; + u32 len; + + /* Status code. 
Only valid when blk_rq_dma_map_iter_* returned false */ + blk_status_t status; + + /* Internal to blk_rq_dma_map_iter_* */ + struct req_iterator iter; + struct pci_p2pdma_map_state p2pdma; +}; + +bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter); +bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter); + +/** + * blk_rq_dma_map_coalesce - were all segments coalesced? + * @state: DMA state to check + * + * Returns true if blk_rq_dma_map_iter_start coalesced all segments into a + * single DMA range. + */ +static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) +{ + return dma_use_iova(state); +} + +/** + * blk_rq_dma_unmap - try to DMA unmap a request + * @req: request to unmap + * @dma_dev: device to unmap from + * @state: DMA IOVA state + * @mapped_len: number of bytes to unmap + * + * Returns %false if the callers need to manually unmap every DMA segment + * mapped using @iter or %true if no work is left to be done. + */ +static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, size_t mapped_len) +{ + if (req->cmd_flags & REQ_P2PDMA) + return true; + + if (dma_use_iova(state)) { + dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req), + 0); + return true; + } + + return !dma_need_unmap(dma_dev); +} + +#endif /* BLK_MQ_DMA_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index de8c85a03bb7..2a5a828f19a0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -947,6 +947,8 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); void blk_freeze_queue_start_non_owner(struct request_queue *q); +unsigned int blk_mq_num_possible_queues(unsigned int max_queues); +unsigned int blk_mq_num_online_queues(unsigned int max_queues); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, struct device *dev, unsigned int offset); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3d1577f07c1c..09b99d52fd36 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -350,11 +350,11 @@ enum req_op { /* Close a zone */ REQ_OP_ZONE_CLOSE = (__force blk_opf_t)11, /* Transition a zone to full */ - REQ_OP_ZONE_FINISH = (__force blk_opf_t)12, + REQ_OP_ZONE_FINISH = (__force blk_opf_t)13, /* reset a zone write pointer */ - REQ_OP_ZONE_RESET = (__force blk_opf_t)13, + REQ_OP_ZONE_RESET = (__force blk_opf_t)15, /* reset all the zone present on the device */ - REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)15, + REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)17, /* Driver private requests */ REQ_OP_DRV_IN = (__force blk_opf_t)34, @@ -386,6 +386,7 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ + __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -418,6 +419,7 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) +#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h 
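The driver-facing side of the new <linux/blk-mq-dma.h> interface above works roughly as sketched below (illustrative only, using a driver-owned struct dma_iova_state; descriptor building, error paths and the per-segment bookkeeping that nvme-pci keeps when dma_need_unmap() is true are omitted):

	struct dma_iova_state state = { };
	struct blk_dma_iter iter;
	size_t total_len = 0;

	if (!blk_rq_dma_map_iter_start(req, dma_dev, &state, &iter))
		return iter.status;
	do {
		/* program one hardware data pointer from iter.addr / iter.len */
		total_len += iter.len;
	} while (blk_rq_dma_map_iter_next(req, dma_dev, &state, &iter));
	if (iter.status)
		return iter.status;

	/* ... later, on completion ... */
	if (!blk_rq_dma_unmap(req, dma_dev, &state, total_len))
		/* unmap each mapped segment individually */;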
index 48e6dc1f8889..95886b404b16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -846,6 +846,55 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return disk->nr_zones; } + +/** + * bio_needs_zone_write_plugging - Check if a BIO needs to be handled with zone + * write plugging + * @bio: The BIO being submitted + * + * Return true whenever @bio execution needs to be handled through zone + * write plugging (using blk_zone_plug_bio()). Return false otherwise. + */ +static inline bool bio_needs_zone_write_plugging(struct bio *bio) +{ + enum req_op op = bio_op(bio); + + /* + * Only zoned block devices have a zone write plug hash table. But not + * all of them have one (e.g. DM devices may not need one). + */ + if (!bio->bi_bdev->bd_disk->zone_wplugs_hash) + return false; + + /* Only write operations need zone write plugging. */ + if (!op_is_write(op)) + return false; + + /* Ignore empty flush */ + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return false; + + /* Ignore BIOs that already have been handled by zone write plugging. */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return false; + + /* + * All zone write operations must be handled through zone write plugging + * using blk_zone_plug_bio(). + */ + switch (op) { + case REQ_OP_ZONE_APPEND: + case REQ_OP_WRITE: + case REQ_OP_WRITE_ZEROES: + case REQ_OP_ZONE_FINISH: + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + return true; + default: + return false; + } +} + bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); /** @@ -875,6 +924,12 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } + +static inline bool bio_needs_zone_write_plugging(struct bio *bio) +{ + return false; +} + static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { return false; @@ -1230,15 +1285,6 @@ enum blk_default_limits { BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -/* - * Default upper limit for the software max_sectors limit used for - * regular file system I/O. This can be increased through sysfs. - * - * Not to be confused with the max_hw_sector limit that is entirely - * controlled by the driver, usually based on hardware limits. 
- */ -#define BLK_DEF_MAX_SECTORS_CAP 2560u - static inline struct queue_limits *bdev_limits(struct block_device *bdev) { return &bdev_get_queue(bdev)->limits; diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index fdfb61ccf55a..b907e6c2307d 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -62,7 +62,6 @@ struct cdrom_device_info { __u8 last_sense; __u8 media_written; /* dirty flag, DVD+RW bookkeeping */ unsigned short mmc3_profile; /* current MMC3 profile */ - int (*exit)(struct cdrom_device_info *); int mrw_mode_page; bool opened_for_data; __s64 last_media_change_ms; diff --git a/include/linux/group_cpus.h b/include/linux/group_cpus.h index e42807ec61f6..9d4e5ab6c314 100644 --- a/include/linux/group_cpus.h +++ b/include/linux/group_cpus.h @@ -9,6 +9,6 @@ #include <linux/kernel.h> #include <linux/cpu.h> -struct cpumask *group_cpus_evenly(unsigned int numgrps); +struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks); #endif diff --git a/include/linux/log2.h b/include/linux/log2.h index 1366cb688a6d..2eac3fc9303d 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -255,4 +255,18 @@ int __bits_per(unsigned long n) ) : \ __bits_per(n) \ ) + +/** + * max_pow_of_two_factor - return highest power-of-2 factor + * @n: parameter + * + * find highest power-of-2 which is evenly divisible into n. + * 0 is returned for n == 0 or 1. + */ +static inline __attribute__((const)) +unsigned int max_pow_of_two_factor(unsigned int n) +{ + return n & -n; +} + #endif /* _LINUX_LOG2_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index b65a1b9f2116..655d194f8e72 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -2155,7 +2155,7 @@ enum { NVME_SC_NS_NOT_ATTACHED = 0x11a, NVME_SC_THIN_PROV_NOT_SUPP = 0x11b, NVME_SC_CTRL_LIST_INVALID = 0x11c, - NVME_SC_SELT_TEST_IN_PROGRESS = 0x11d, + NVME_SC_SELF_TEST_IN_PROGRESS = 0x11d, NVME_SC_BP_WRITE_PROHIBITED = 0x11e, NVME_SC_CTRL_ID_INVALID = 0x11f, NVME_SC_SEC_CTRL_STATE_INVALID = 0x120, diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h deleted file mode 100644 index 2f1b952d596a..000000000000 --- a/include/linux/pktcdvd.h +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (C) 2000 Jens Axboe <axboe@suse.de> - * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com> - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and - * DVD-RW devices. 
- * - */ -#ifndef __PKTCDVD_H -#define __PKTCDVD_H - -#include <linux/blkdev.h> -#include <linux/completion.h> -#include <linux/cdrom.h> -#include <linux/kobject.h> -#include <linux/sysfs.h> -#include <linux/mempool.h> -#include <uapi/linux/pktcdvd.h> - -/* default bio write queue congestion marks */ -#define PKT_WRITE_CONGESTION_ON 10000 -#define PKT_WRITE_CONGESTION_OFF 9000 - - -struct packet_settings -{ - __u32 size; /* packet size in (512 byte) sectors */ - __u8 fp; /* fixed packets */ - __u8 link_loss; /* the rest is specified - * as per Mt Fuji */ - __u8 write_type; - __u8 track_mode; - __u8 block_mode; -}; - -/* - * Very crude stats for now - */ -struct packet_stats -{ - unsigned long pkt_started; - unsigned long pkt_ended; - unsigned long secs_w; - unsigned long secs_rg; - unsigned long secs_r; -}; - -struct packet_cdrw -{ - struct list_head pkt_free_list; - struct list_head pkt_active_list; - spinlock_t active_list_lock; /* Serialize access to pkt_active_list */ - struct task_struct *thread; - atomic_t pending_bios; -}; - -/* - * Switch to high speed reading after reading this many kilobytes - * with no interspersed writes. - */ -#define HI_SPEED_SWITCH 512 - -struct packet_iosched -{ - atomic_t attention; /* Set to non-zero when queue processing is needed */ - int writing; /* Non-zero when writing, zero when reading */ - spinlock_t lock; /* Protecting read/write queue manipulations */ - struct bio_list read_queue; - struct bio_list write_queue; - sector_t last_write; /* The sector where the last write ended */ - int successive_reads; -}; - -/* - * 32 buffers of 2048 bytes - */ -#if (PAGE_SIZE % CD_FRAMESIZE) != 0 -#error "PAGE_SIZE must be a multiple of CD_FRAMESIZE" -#endif -#define PACKET_MAX_SIZE 128 -#define FRAMES_PER_PAGE (PAGE_SIZE / CD_FRAMESIZE) -#define PACKET_MAX_SECTORS (PACKET_MAX_SIZE * CD_FRAMESIZE >> 9) - -enum packet_data_state { - PACKET_IDLE_STATE, /* Not used at the moment */ - PACKET_WAITING_STATE, /* Waiting for more bios to arrive, so */ - /* we don't have to do as much */ - /* data gathering */ - PACKET_READ_WAIT_STATE, /* Waiting for reads to fill in holes */ - PACKET_WRITE_WAIT_STATE, /* Waiting for the write to complete */ - PACKET_RECOVERY_STATE, /* Recover after read/write errors */ - PACKET_FINISHED_STATE, /* After write has finished */ - - PACKET_NUM_STATES /* Number of possible states */ -}; - -/* - * Information needed for writing a single packet - */ -struct pktcdvd_device; - -struct packet_data -{ - struct list_head list; - - spinlock_t lock; /* Lock protecting state transitions and */ - /* orig_bios list */ - - struct bio_list orig_bios; /* Original bios passed to pkt_make_request */ - /* that will be handled by this packet */ - int write_size; /* Total size of all bios in the orig_bios */ - /* list, measured in number of frames */ - - struct bio *w_bio; /* The bio we will send to the real CD */ - /* device once we have all data for the */ - /* packet we are going to write */ - sector_t sector; /* First sector in this packet */ - int frames; /* Number of frames in this packet */ - - enum packet_data_state state; /* Current state */ - atomic_t run_sm; /* Incremented whenever the state */ - /* machine needs to be run */ - long sleep_time; /* Set this to non-zero to make the state */ - /* machine run after this many jiffies. 
*/ - - atomic_t io_wait; /* Number of pending IO operations */ - atomic_t io_errors; /* Number of read/write errors during IO */ - - struct bio *r_bios[PACKET_MAX_SIZE]; /* bios to use during data gathering */ - struct page *pages[PACKET_MAX_SIZE / FRAMES_PER_PAGE]; - - int cache_valid; /* If non-zero, the data for the zone defined */ - /* by the sector variable is completely cached */ - /* in the pages[] vector. */ - - int id; /* ID number for debugging */ - struct pktcdvd_device *pd; -}; - -struct pkt_rb_node { - struct rb_node rb_node; - struct bio *bio; -}; - -struct packet_stacked_data -{ - struct bio *bio; /* Original read request bio */ - struct pktcdvd_device *pd; -}; -#define PSD_POOL_SIZE 64 - -struct pktcdvd_device -{ - struct file *bdev_file; /* dev attached */ - /* handle acquired for bdev during pkt_open_dev() */ - struct file *f_open_bdev; - dev_t pkt_dev; /* our dev */ - struct packet_settings settings; - struct packet_stats stats; - int refcnt; /* Open count */ - int write_speed; /* current write speed, kB/s */ - int read_speed; /* current read speed, kB/s */ - unsigned long offset; /* start offset */ - __u8 mode_offset; /* 0 / 8 */ - __u8 type; - unsigned long flags; - __u16 mmc3_profile; - __u32 nwa; /* next writable address */ - __u32 lra; /* last recorded address */ - struct packet_cdrw cdrw; - wait_queue_head_t wqueue; - - spinlock_t lock; /* Serialize access to bio_queue */ - struct rb_root bio_queue; /* Work queue of bios we need to handle */ - int bio_queue_size; /* Number of nodes in bio_queue */ - bool congested; /* Someone is waiting for bio_queue_size - * to drop. */ - sector_t current_sector; /* Keep track of where the elevator is */ - atomic_t scan_queue; /* Set to non-zero when pkt_handle_queue */ - /* needs to be run. */ - mempool_t rb_pool; /* mempool for pkt_rb_node allocations */ - - struct packet_iosched iosched; - struct gendisk *disk; - - int write_congestion_off; - int write_congestion_on; - - struct device *dev; /* sysfs pktcdvd[0-7] dev */ - - struct dentry *dfs_d_root; /* debugfs: devname directory */ - struct dentry *dfs_f_info; /* debugfs: info file */ -}; - -#endif /* __PKTCDVD_H */ diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 14a924c0e303..6aa79e2d799c 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -11,7 +11,7 @@ #include <linux/tracepoint.h> #include <uapi/linux/ioprio.h> -#define RWBS_LEN 9 +#define RWBS_LEN 10 #define IOPRIO_CLASS_STRINGS \ { IOPRIO_CLASS_NONE, "none" }, \ @@ -405,6 +405,17 @@ DEFINE_EVENT(block_bio, block_getrq, ); /** + * blk_zone_append_update_request_bio - update bio sector after zone append + * @rq: the completed request that sets the bio sector + * + * Update the bio's bi_sector after a zone append command has been completed. + */ +DEFINE_EVENT(block_rq, blk_zone_append_update_request_bio, + TP_PROTO(struct request *rq), + TP_ARGS(rq) +); + +/** * block_plug - keep operations requests in request queue * @q: request queue to plug * @@ -588,6 +599,84 @@ TRACE_EVENT(block_rq_remap, (unsigned long long)__entry->old_sector, __entry->nr_bios) ); +/** + * blkdev_zone_mgmt - Execute a zone management operation on a range of zones + * @bio: The block IO operation sent down to the device + * @nr_sectors: The number of sectors affected by this operation + * + * Execute a zone management operation on a specified range of zones. This + * range is encoded in %nr_sectors, which has to be a multiple of the zone + * size. 
+ */ +TRACE_EVENT(blkdev_zone_mgmt, + + TP_PROTO(struct bio *bio, sector_t nr_sectors), + + TP_ARGS(bio, nr_sectors), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( sector_t, nr_sectors ) + __array( char, rwbs, RWBS_LEN) + ), + + TP_fast_assign( + __entry->dev = bio_dev(bio); + __entry->sector = bio->bi_iter.bi_sector; + __entry->nr_sectors = bio_sectors(bio); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); + ), + + TP_printk("%d,%d %s %llu + %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sectors) +); + +DECLARE_EVENT_CLASS(block_zwplug, + + TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector, + unsigned int nr_sectors), + + TP_ARGS(q, zno, sector, nr_sectors), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned int, zno ) + __field( sector_t, sector ) + __field( unsigned int, nr_sectors ) + ), + + TP_fast_assign( + __entry->dev = disk_devt(q->disk); + __entry->zno = zno; + __entry->sector = sector; + __entry->nr_sectors = nr_sectors; + ), + + TP_printk("%d,%d zone %u, BIO %llu + %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->zno, + (unsigned long long)__entry->sector, + __entry->nr_sectors) +); + +DEFINE_EVENT(block_zwplug, disk_zone_wplug_add_bio, + + TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector, + unsigned int nr_sectors), + + TP_ARGS(q, zno, sector, nr_sectors) +); + +DEFINE_EVENT(block_zwplug, blk_zone_wplug_bio, + + TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector, + unsigned int nr_sectors), + + TP_ARGS(q, zno, sector, nr_sectors) +); + #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index c9751bdfd937..ec77dabba45b 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -301,6 +301,16 @@ */ #define UBLK_F_PER_IO_DAEMON (1ULL << 13) +/* + * If this feature is set, UBLK_U_IO_REGISTER_IO_BUF/UBLK_U_IO_UNREGISTER_IO_BUF + * can be issued for an I/O on any task. q_id and tag are also ignored in + * UBLK_U_IO_UNREGISTER_IO_BUF's ublksrv_io_cmd. + * If it is unset, zero-copy buffers can only be registered and unregistered by + * the I/O's daemon task. The q_id and tag of the registered buffer are required + * in UBLK_U_IO_UNREGISTER_IO_BUF's ublksrv_io_cmd. + */ +#define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 44a4eba80315..4013e6ad2b2f 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -69,21 +69,20 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) * have multiple sets, build each sets affinity mask separately. 
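The new UBLK_F_BUF_REG_OFF_DAEMON flag relaxes which task may register and unregister zero-copy buffers for an I/O. A user-space sketch of how a ublk server might gate that decision on the advertised device flags (the example_* helper is hypothetical)::

    #include <stdbool.h>
    #include <linux/ublk_cmd.h>

    /*
     * Decide whether the current thread may issue
     * UBLK_U_IO_REGISTER_IO_BUF / UBLK_U_IO_UNREGISTER_IO_BUF for an I/O.
     * dev_flags is ublksrv_ctrl_dev_info.flags; on_daemon_task tells
     * whether we are running on that I/O's daemon task.
     */
    static bool example_buf_reg_allowed(__u64 dev_flags, bool on_daemon_task)
    {
            if (dev_flags & UBLK_F_BUF_REG_OFF_DAEMON)
                    return true;    /* any task may (un)register the buffer */

            return on_daemon_task;  /* otherwise only the daemon task */
    }
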
*/ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { - unsigned int this_vecs = affd->set_size[i]; - int j; - struct cpumask *result = group_cpus_evenly(this_vecs); + unsigned int nr_masks, this_vecs = affd->set_size[i]; + struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks); if (!result) { kfree(masks); return NULL; } - for (j = 0; j < this_vecs; j++) + for (int j = 0; j < nr_masks; j++) cpumask_copy(&masks[curvec + j].mask, &result[j]); kfree(result); - curvec += this_vecs; - usedvecs += this_vecs; + curvec += nr_masks; + usedvecs += nr_masks; } /* Fill out vectors at the end that don't need affinity */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3f6a7bdc6edf..47168d2afbf1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1875,6 +1875,29 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) case REQ_OP_READ: rwbs[i++] = 'R'; break; + case REQ_OP_ZONE_APPEND: + rwbs[i++] = 'Z'; + rwbs[i++] = 'A'; + break; + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + rwbs[i++] = 'Z'; + rwbs[i++] = 'R'; + if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL) + rwbs[i++] = 'A'; + break; + case REQ_OP_ZONE_FINISH: + rwbs[i++] = 'Z'; + rwbs[i++] = 'F'; + break; + case REQ_OP_ZONE_OPEN: + rwbs[i++] = 'Z'; + rwbs[i++] = 'O'; + break; + case REQ_OP_ZONE_CLOSE: + rwbs[i++] = 'Z'; + rwbs[i++] = 'C'; + break; default: rwbs[i++] = 'N'; } @@ -1890,6 +1913,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) if (opf & REQ_ATOMIC) rwbs[i++] = 'U'; + WARN_ON_ONCE(i >= RWBS_LEN); + rwbs[i] = '\0'; } EXPORT_SYMBOL_GPL(blk_fill_rwbs); diff --git a/lib/group_cpus.c b/lib/group_cpus.c index 18d43a406114..6d08ac05f371 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -332,9 +332,11 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, /** * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality * @numgrps: number of groups + * @nummasks: number of initialized cpumasks * * Return: cpumask array if successful, NULL otherwise. And each element - * includes CPUs assigned to this group + * includes CPUs assigned to this group. nummasks contains the number + * of initialized masks which can be less than numgrps. 
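With the reworked group_cpus_evenly() prototype, callers must size their loops by the returned nr_masks rather than by the requested group count, as the irq affinity code above now does. A sketch of a hypothetical caller following the same pattern::

    #include <linux/group_cpus.h>
    #include <linux/printk.h>
    #include <linux/slab.h>

    /*
     * Only the first nr_masks entries of the returned array are
     * initialized, and nr_masks may be smaller than nr_queues.
     */
    static int example_spread_queues(unsigned int nr_queues)
    {
            unsigned int i, nr_masks;
            struct cpumask *masks = group_cpus_evenly(nr_queues, &nr_masks);

            if (!masks)
                    return -ENOMEM;

            for (i = 0; i < nr_masks; i++)
                    pr_info("queue %u -> CPUs %*pbl\n", i,
                            cpumask_pr_args(&masks[i]));

            kfree(masks);
            return 0;
    }
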
* * Try to put close CPUs from viewpoint of CPU and NUMA locality into * same group, and run two-stage grouping: @@ -344,7 +346,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, * We guarantee in the resulted grouping that all CPUs are covered, and * no same CPU is assigned to multiple groups */ -struct cpumask *group_cpus_evenly(unsigned int numgrps) +struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) { unsigned int curgrp = 0, nr_present = 0, nr_others = 0; cpumask_var_t *node_to_cpumask; @@ -389,7 +391,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, npresmsk, nmsk, masks); if (ret < 0) - goto fail_build_affinity; + goto fail_node_to_cpumask; nr_present = ret; /* @@ -408,10 +410,6 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) if (ret >= 0) nr_others = ret; - fail_build_affinity: - if (ret >= 0) - WARN_ON(nr_present + nr_others < numgrps); - fail_node_to_cpumask: free_node_to_cpumask(node_to_cpumask); @@ -424,10 +422,11 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) kfree(masks); return NULL; } + *nummasks = min(nr_present + nr_others, numgrps); return masks; } #else /* CONFIG_SMP */ -struct cpumask *group_cpus_evenly(unsigned int numgrps) +struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks) { struct cpumask *masks; @@ -440,6 +439,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) /* assign all CPUs(cpu 0) to the 1st group only */ cpumask_copy(&masks[0], cpu_possible_mask); + *nummasks = 1; return masks; } #endif /* CONFIG_SMP */ diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c index 6e60f7d97125..b227bd78b252 100644 --- a/tools/testing/selftests/ublk/fault_inject.c +++ b/tools/testing/selftests/ublk/fault_inject.c @@ -38,7 +38,8 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, return 0; } -static int ublk_fault_inject_queue_io(struct ublk_queue *q, int tag) +static int ublk_fault_inject_queue_io(struct ublk_thread *t, + struct ublk_queue *q, int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe; @@ -46,25 +47,27 @@ static int ublk_fault_inject_queue_io(struct ublk_queue *q, int tag) .tv_nsec = (long long)q->dev->private_data, }; - ublk_io_alloc_sqes(ublk_get_io(q, tag), &sqe, 1); + ublk_io_alloc_sqes(t, &sqe, 1); io_uring_prep_timeout(sqe, &ts, 1, 0); sqe->user_data = build_user_data(tag, ublksrv_get_op(iod), 0, q->q_id, 1); - ublk_queued_tgt_io(q, tag, 1); + ublk_queued_tgt_io(t, q, tag, 1); return 0; } -static void ublk_fault_inject_tgt_io_done(struct ublk_queue *q, int tag, +static void ublk_fault_inject_tgt_io_done(struct ublk_thread *t, + struct ublk_queue *q, const struct io_uring_cqe *cqe) { + unsigned tag = user_data_to_tag(cqe->user_data); const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); if (cqe->res != -ETIME) ublk_err("%s: unexpected cqe res %d\n", __func__, cqe->res); - if (ublk_completed_tgt_io(q, tag)) - ublk_complete_io(q, tag, iod->nr_sectors << 9); + if (ublk_completed_tgt_io(t, q, tag)) + ublk_complete_io(t, q, tag, iod->nr_sectors << 9); else ublk_err("%s: io not complete after 1 cqe\n", __func__); } diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index cfa59b631693..2d93ac860bd5 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -13,12 +13,13 @@ 
static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int assert(0); } -static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, + const struct ublksrv_io_desc *iod, int tag) { unsigned ublk_op = ublksrv_get_op(iod); struct io_uring_sqe *sqe[1]; - ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1); + ublk_io_alloc_sqes(t, sqe, 1); io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC); io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ @@ -26,7 +27,8 @@ static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_des return 1; } -static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, + const struct ublksrv_io_desc *iod, int tag) { unsigned ublk_op = ublksrv_get_op(iod); unsigned zc = ublk_queue_use_zc(q); @@ -36,7 +38,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de void *addr = (zc | auto_zc) ? NULL : (void *)iod->addr; if (!zc || auto_zc) { - ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1); + ublk_io_alloc_sqes(t, sqe, 1); if (!sqe[0]) return -ENOMEM; @@ -52,7 +54,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de return 1; } - ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3); + ublk_io_alloc_sqes(t, sqe, 3); io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; @@ -72,7 +74,7 @@ static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_de return 2; } -static int loop_queue_tgt_io(struct ublk_queue *q, int tag) +static int loop_queue_tgt_io(struct ublk_thread *t, struct ublk_queue *q, int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); unsigned ublk_op = ublksrv_get_op(iod); @@ -80,7 +82,7 @@ static int loop_queue_tgt_io(struct ublk_queue *q, int tag) switch (ublk_op) { case UBLK_IO_OP_FLUSH: - ret = loop_queue_flush_io(q, iod, tag); + ret = loop_queue_flush_io(t, q, iod, tag); break; case UBLK_IO_OP_WRITE_ZEROES: case UBLK_IO_OP_DISCARD: @@ -88,7 +90,7 @@ static int loop_queue_tgt_io(struct ublk_queue *q, int tag) break; case UBLK_IO_OP_READ: case UBLK_IO_OP_WRITE: - ret = loop_queue_tgt_rw_io(q, iod, tag); + ret = loop_queue_tgt_rw_io(t, q, iod, tag); break; default: ret = -EINVAL; @@ -100,17 +102,19 @@ static int loop_queue_tgt_io(struct ublk_queue *q, int tag) return ret; } -static int ublk_loop_queue_io(struct ublk_queue *q, int tag) +static int ublk_loop_queue_io(struct ublk_thread *t, struct ublk_queue *q, + int tag) { - int queued = loop_queue_tgt_io(q, tag); + int queued = loop_queue_tgt_io(t, q, tag); - ublk_queued_tgt_io(q, tag, queued); + ublk_queued_tgt_io(t, q, tag, queued); return 0; } -static void ublk_loop_io_done(struct ublk_queue *q, int tag, +static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, const struct io_uring_cqe *cqe) { + unsigned tag = user_data_to_tag(cqe->user_data); unsigned op = user_data_to_op(cqe->user_data); struct ublk_io *io = ublk_get_io(q, tag); @@ -126,8 +130,8 @@ static void ublk_loop_io_done(struct ublk_queue *q, int tag, if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) io->tgt_ios += 1; - if (ublk_completed_tgt_io(q, tag)) - ublk_complete_io(q, tag, io->result); + if (ublk_completed_tgt_io(t, q, tag)) + 
ublk_complete_io(t, q, tag, io->result); } static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index e2d2042810d4..95188065b2e9 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -441,17 +441,10 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags) unsigned long off; q->tgt_ops = dev->tgt.ops; - q->state = 0; + q->flags = 0; q->q_depth = depth; - - if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) { - q->state |= UBLKSRV_NO_BUF; - if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) - q->state |= UBLKSRV_ZC; - if (dev->dev_info.flags & UBLK_F_AUTO_BUF_REG) - q->state |= UBLKSRV_AUTO_BUF_REG; - } - q->state |= extra_flags; + q->flags = dev->dev_info.flags; + q->flags |= extra_flags; cmd_buf_size = ublk_queue_cmd_buf_sz(q); off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); @@ -466,10 +459,10 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags) io_buf_size = dev->dev_info.max_io_buf_bytes; for (i = 0; i < q->q_depth; i++) { q->ios[i].buf_addr = NULL; - q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE; + q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE; q->ios[i].tag = i; - if (q->state & UBLKSRV_NO_BUF) + if (ublk_queue_no_buf(q)) continue; if (posix_memalign((void **)&q->ios[i].buf_addr, @@ -583,15 +576,14 @@ static void ublk_set_auto_buf_reg(const struct ublk_queue *q, else buf.index = q->ios[tag].buf_index; - if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK) + if (ublk_queue_auto_zc_fallback(q)) buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf); } -int ublk_queue_io_cmd(struct ublk_io *io) +int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) { - struct ublk_thread *t = io->t; struct ublk_queue *q = ublk_io_to_queue(io); struct ublksrv_io_cmd *cmd; struct io_uring_sqe *sqe[1]; @@ -599,7 +591,7 @@ int ublk_queue_io_cmd(struct ublk_io *io) __u64 user_data; /* only freed io can be issued */ - if (!(io->flags & UBLKSRV_IO_FREE)) + if (!(io->flags & UBLKS_IO_FREE)) return 0; /* @@ -607,20 +599,20 @@ int ublk_queue_io_cmd(struct ublk_io *io) * getting data */ if (!(io->flags & - (UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP | UBLKSRV_NEED_GET_DATA))) + (UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_NEED_GET_DATA))) return 0; - if (io->flags & UBLKSRV_NEED_GET_DATA) + if (io->flags & UBLKS_IO_NEED_GET_DATA) cmd_op = UBLK_U_IO_NEED_GET_DATA; - else if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP) + else if (io->flags & UBLKS_IO_NEED_COMMIT_RQ_COMP) cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ; - else if (io->flags & UBLKSRV_NEED_FETCH_RQ) + else if (io->flags & UBLKS_IO_NEED_FETCH_RQ) cmd_op = UBLK_U_IO_FETCH_REQ; if (io_uring_sq_space_left(&t->ring) < 1) io_uring_submit(&t->ring); - ublk_io_alloc_sqes(io, sqe, 1); + ublk_io_alloc_sqes(t, sqe, 1); if (!sqe[0]) { ublk_err("%s: run out of sqe. 
thread %u, tag %d\n", __func__, t->idx, io->tag); @@ -640,12 +632,12 @@ int ublk_queue_io_cmd(struct ublk_io *io) sqe[0]->rw_flags = 0; cmd->tag = io->tag; cmd->q_id = q->q_id; - if (!(q->state & UBLKSRV_NO_BUF)) + if (!ublk_queue_no_buf(q)) cmd->addr = (__u64) (uintptr_t) io->buf_addr; else cmd->addr = 0; - if (q->state & UBLKSRV_AUTO_BUF_REG) + if (ublk_queue_use_auto_zc(q)) ublk_set_auto_buf_reg(q, sqe[0], io->tag); user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0); @@ -657,7 +649,7 @@ int ublk_queue_io_cmd(struct ublk_io *io) ublk_dbg(UBLK_DBG_IO_CMD, "%s: (thread %u qid %d tag %u cmd_op %u) iof %x stopping %d\n", __func__, t->idx, q->q_id, io->tag, cmd_op, - io->flags, !!(t->state & UBLKSRV_THREAD_STOPPING)); + io->flags, !!(t->state & UBLKS_T_STOPPING)); return 1; } @@ -685,9 +677,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t) int tag = i % dinfo->queue_depth; q = &t->dev->q[q_id]; io = &q->ios[tag]; - io->t = t; io->buf_index = j++; - ublk_queue_io_cmd(io); + ublk_queue_io_cmd(t, io); } } else { /* @@ -697,9 +688,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t) struct ublk_queue *q = &t->dev->q[t->idx]; for (i = 0; i < q->q_depth; i++) { io = &q->ios[i]; - io->t = t; io->buf_index = i; - ublk_queue_io_cmd(io); + ublk_queue_io_cmd(t, io); } } } @@ -711,14 +701,13 @@ static int ublk_thread_is_idle(struct ublk_thread *t) static int ublk_thread_is_done(struct ublk_thread *t) { - return (t->state & UBLKSRV_THREAD_STOPPING) && ublk_thread_is_idle(t); + return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t); } -static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q, - struct io_uring_cqe *cqe) +static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t, + struct ublk_queue *q, + struct io_uring_cqe *cqe) { - unsigned tag = user_data_to_tag(cqe->user_data); - if (cqe->res < 0 && cqe->res != -EAGAIN) ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n", __func__, cqe->res, q->q_id, @@ -726,7 +715,41 @@ static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q, user_data_to_op(cqe->user_data)); if (q->tgt_ops->tgt_io_done) - q->tgt_ops->tgt_io_done(q, tag, cqe); + q->tgt_ops->tgt_io_done(t, q, cqe); +} + +static void ublk_handle_uring_cmd(struct ublk_thread *t, + struct ublk_queue *q, + const struct io_uring_cqe *cqe) +{ + int fetch = (cqe->res != UBLK_IO_RES_ABORT) && + !(t->state & UBLKS_T_STOPPING); + unsigned tag = user_data_to_tag(cqe->user_data); + struct ublk_io *io = &q->ios[tag]; + + if (!fetch) { + t->state |= UBLKS_T_STOPPING; + io->flags &= ~UBLKS_IO_NEED_FETCH_RQ; + } + + if (cqe->res == UBLK_IO_RES_OK) { + assert(tag < q->q_depth); + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(t, q, tag); + } else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) { + io->flags |= UBLKS_IO_NEED_GET_DATA | UBLKS_IO_FREE; + ublk_queue_io_cmd(t, io); + } else { + /* + * COMMIT_REQ will be completed immediately since no fetching + * piggyback is required. 
+ * + * Marking IO_FREE only, then this io won't be issued since + * we only issue io with (UBLKS_IO_FREE | UBLKSRV_NEED_*) + * + * */ + io->flags = UBLKS_IO_FREE; + } } static void ublk_handle_cqe(struct ublk_thread *t, @@ -735,54 +758,27 @@ static void ublk_handle_cqe(struct ublk_thread *t, struct ublk_dev *dev = t->dev; unsigned q_id = user_data_to_q_id(cqe->user_data); struct ublk_queue *q = &dev->q[q_id]; - unsigned tag = user_data_to_tag(cqe->user_data); unsigned cmd_op = user_data_to_op(cqe->user_data); - int fetch = (cqe->res != UBLK_IO_RES_ABORT) && - !(t->state & UBLKSRV_THREAD_STOPPING); - struct ublk_io *io; if (cqe->res < 0 && cqe->res != -ENODEV) ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, - cqe->res, cqe->user_data, q->state); + cqe->res, cqe->user_data, q->flags); ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", - __func__, cqe->res, q->q_id, tag, cmd_op, - is_target_io(cqe->user_data), + __func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data), + cmd_op, is_target_io(cqe->user_data), user_data_to_tgt_data(cqe->user_data), - (t->state & UBLKSRV_THREAD_STOPPING)); + (t->state & UBLKS_T_STOPPING)); /* Don't retrieve io in case of target io */ if (is_target_io(cqe->user_data)) { - ublksrv_handle_tgt_cqe(q, cqe); + ublksrv_handle_tgt_cqe(t, q, cqe); return; } - io = &q->ios[tag]; t->cmd_inflight--; - if (!fetch) { - t->state |= UBLKSRV_THREAD_STOPPING; - io->flags &= ~UBLKSRV_NEED_FETCH_RQ; - } - - if (cqe->res == UBLK_IO_RES_OK) { - assert(tag < q->q_depth); - if (q->tgt_ops->queue_io) - q->tgt_ops->queue_io(q, tag); - } else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) { - io->flags |= UBLKSRV_NEED_GET_DATA | UBLKSRV_IO_FREE; - ublk_queue_io_cmd(io); - } else { - /* - * COMMIT_REQ will be completed immediately since no fetching - * piggyback is required. 
- * - * Marking IO_FREE only, then this io won't be issued since - * we only issue io with (UBLKSRV_IO_FREE | UBLKSRV_NEED_*) - * - * */ - io->flags = UBLKSRV_IO_FREE; - } + ublk_handle_uring_cmd(t, q, cqe); } static int ublk_reap_events_uring(struct ublk_thread *t) @@ -808,7 +804,7 @@ static int ublk_process_io(struct ublk_thread *t) t->dev->dev_info.dev_id, t->idx, io_uring_sq_ready(&t->ring), t->cmd_inflight, - (t->state & UBLKSRV_THREAD_STOPPING)); + (t->state & UBLKS_T_STOPPING)); if (ublk_thread_is_done(t)) return -ENODEV; @@ -817,8 +813,8 @@ static int ublk_process_io(struct ublk_thread *t) reapped = ublk_reap_events_uring(t); ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n", - ret, reapped, (t->state & UBLKSRV_THREAD_STOPPING), - (t->state & UBLKSRV_THREAD_IDLE)); + ret, reapped, (t->state & UBLKS_T_STOPPING), + (t->state & UBLKS_T_IDLE)); return reapped; } @@ -915,7 +911,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) { const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; struct ublk_thread_info *tinfo; - unsigned extra_flags = 0; + unsigned long long extra_flags = 0; cpu_set_t *affinity_buf; void *thread_ret; sem_t ready; @@ -937,7 +933,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) return ret; if (ctx->auto_zc_fallback) - extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK; + extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK; for (i = 0; i < dinfo->nr_hw_queues; i++) { dev->q[i].dev = dev; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 6be601536b3d..219233f8a053 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -29,13 +29,9 @@ #include "ublk_dep.h" #include <linux/ublk_cmd.h> -#define __maybe_unused __attribute__((unused)) -#define MAX_BACK_FILES 4 -#ifndef min -#define min(a, b) ((a) < (b) ? 
(a) : (b)) -#endif +#include "utils.h" -#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) +#define MAX_BACK_FILES 4 /****************** part 1: libublk ********************/ @@ -45,9 +41,6 @@ #define UBLK_CTRL_RING_DEPTH 32 #define ERROR_EVTFD_DEVID -2 -/* queue idle timeout */ -#define UBLKSRV_IO_IDLE_SECS 20 - #define UBLK_IO_MAX_BYTES (1 << 20) #define UBLK_MAX_QUEUES_SHIFT 5 #define UBLK_MAX_QUEUES (1 << UBLK_MAX_QUEUES_SHIFT) @@ -55,13 +48,6 @@ #define UBLK_MAX_THREADS (1 << UBLK_MAX_THREADS_SHIFT) #define UBLK_QUEUE_DEPTH 1024 -#define UBLK_DBG_DEV (1U << 0) -#define UBLK_DBG_THREAD (1U << 1) -#define UBLK_DBG_IO_CMD (1U << 2) -#define UBLK_DBG_IO (1U << 3) -#define UBLK_DBG_CTRL_CMD (1U << 4) -#define UBLK_LOG (1U << 5) - struct ublk_dev; struct ublk_queue; struct ublk_thread; @@ -121,11 +107,11 @@ struct ublk_ctrl_cmd_data { struct ublk_io { char *buf_addr; -#define UBLKSRV_NEED_FETCH_RQ (1UL << 0) -#define UBLKSRV_NEED_COMMIT_RQ_COMP (1UL << 1) -#define UBLKSRV_IO_FREE (1UL << 2) -#define UBLKSRV_NEED_GET_DATA (1UL << 3) -#define UBLKSRV_NEED_REG_BUF (1UL << 4) +#define UBLKS_IO_NEED_FETCH_RQ (1UL << 0) +#define UBLKS_IO_NEED_COMMIT_RQ_COMP (1UL << 1) +#define UBLKS_IO_FREE (1UL << 2) +#define UBLKS_IO_NEED_GET_DATA (1UL << 3) +#define UBLKS_IO_NEED_REG_BUF (1UL << 4) unsigned short flags; unsigned short refs; /* used by target code only */ @@ -136,7 +122,6 @@ struct ublk_io { unsigned short buf_index; unsigned short tgt_ios; void *private_data; - struct ublk_thread *t; }; struct ublk_tgt_ops { @@ -144,9 +129,9 @@ struct ublk_tgt_ops { int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *); void (*deinit_tgt)(struct ublk_dev *); - int (*queue_io)(struct ublk_queue *, int tag); - void (*tgt_io_done)(struct ublk_queue *, - int tag, const struct io_uring_cqe *); + int (*queue_io)(struct ublk_thread *, struct ublk_queue *, int tag); + void (*tgt_io_done)(struct ublk_thread *, struct ublk_queue *, + const struct io_uring_cqe *); /* * Target specific command line handling @@ -179,12 +164,10 @@ struct ublk_queue { const struct ublk_tgt_ops *tgt_ops; struct ublksrv_io_desc *io_cmd_buf; +/* borrow one bit of ublk uapi flags, which may never be used */ +#define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63) + __u64 flags; struct ublk_io ios[UBLK_QUEUE_DEPTH]; -#define UBLKSRV_NO_BUF (1U << 2) -#define UBLKSRV_ZC (1U << 3) -#define UBLKSRV_AUTO_BUF_REG (1U << 4) -#define UBLKSRV_AUTO_BUF_REG_FALLBACK (1U << 5) - unsigned state; }; struct ublk_thread { @@ -196,8 +179,8 @@ struct ublk_thread { pthread_t thread; unsigned idx; -#define UBLKSRV_THREAD_STOPPING (1U << 0) -#define UBLKSRV_THREAD_IDLE (1U << 1) +#define UBLKS_T_STOPPING (1U << 0) +#define UBLKS_T_IDLE (1U << 1) unsigned state; }; @@ -217,22 +200,7 @@ struct ublk_dev { void *private_data; }; -#ifndef offsetof -#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) -#endif - -#ifndef container_of -#define container_of(ptr, type, member) ({ \ - unsigned long __mptr = (unsigned long)(ptr); \ - ((type *)(__mptr - offsetof(type, member))); }) -#endif - -#define round_up(val, rnd) \ - (((val) + ((rnd) - 1)) & ~((rnd) - 1)) - - -extern unsigned int ublk_dbg_mask; -extern int ublk_queue_io_cmd(struct ublk_io *io); +extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) @@ -281,43 +249,15 @@ static inline unsigned short ublk_cmd_op_nr(unsigned int op) return _IOC_NR(op); } -static inline void ublk_err(const char *fmt, ...) 
-{ - va_list ap; - - va_start(ap, fmt); - vfprintf(stderr, fmt, ap); -} - -static inline void ublk_log(const char *fmt, ...) -{ - if (ublk_dbg_mask & UBLK_LOG) { - va_list ap; - - va_start(ap, fmt); - vfprintf(stdout, fmt, ap); - } -} - -static inline void ublk_dbg(int level, const char *fmt, ...) -{ - if (level & ublk_dbg_mask) { - va_list ap; - - va_start(ap, fmt); - vfprintf(stdout, fmt, ap); - } -} - static inline struct ublk_queue *ublk_io_to_queue(const struct ublk_io *io) { return container_of(io, struct ublk_queue, ios[io->tag]); } -static inline int ublk_io_alloc_sqes(struct ublk_io *io, +static inline int ublk_io_alloc_sqes(struct ublk_thread *t, struct io_uring_sqe *sqes[], int nr_sqes) { - struct io_uring *ring = &io->t->ring; + struct io_uring *ring = &t->ring; unsigned left = io_uring_sq_space_left(ring); int i; @@ -380,7 +320,7 @@ static inline int ublk_get_io_res(const struct ublk_queue *q, unsigned tag) static inline void ublk_mark_io_done(struct ublk_io *io, int res) { - io->flags |= (UBLKSRV_NEED_COMMIT_RQ_COMP | UBLKSRV_IO_FREE); + io->flags |= (UBLKS_IO_NEED_COMMIT_RQ_COMP | UBLKS_IO_FREE); io->result = res; } @@ -402,45 +342,58 @@ static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) return &q->ios[tag]; } -static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res) +static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) { struct ublk_io *io = &q->ios[tag]; ublk_mark_io_done(io, res); - return ublk_queue_io_cmd(io); + return ublk_queue_io_cmd(t, io); } -static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int queued) +static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int queued) { if (queued < 0) - ublk_complete_io(q, tag, queued); + ublk_complete_io(t, q, tag, queued); else { struct ublk_io *io = ublk_get_io(q, tag); - io->t->io_inflight += queued; + t->io_inflight += queued; io->tgt_ios = queued; io->result = 0; } } -static inline int ublk_completed_tgt_io(struct ublk_queue *q, unsigned tag) +static inline int ublk_completed_tgt_io(struct ublk_thread *t, + struct ublk_queue *q, unsigned tag) { struct ublk_io *io = ublk_get_io(q, tag); - io->t->io_inflight--; + t->io_inflight--; return --io->tgt_ios == 0; } static inline int ublk_queue_use_zc(const struct ublk_queue *q) { - return q->state & UBLKSRV_ZC; + return q->flags & UBLK_F_SUPPORT_ZERO_COPY; } static inline int ublk_queue_use_auto_zc(const struct ublk_queue *q) { - return q->state & UBLKSRV_AUTO_BUF_REG; + return q->flags & UBLK_F_AUTO_BUF_REG; +} + +static inline int ublk_queue_auto_zc_fallback(const struct ublk_queue *q) +{ + return q->flags & UBLKS_Q_AUTO_BUF_REG_FALLBACK; +} + +static inline int ublk_queue_no_buf(const struct ublk_queue *q) +{ + return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } extern const struct ublk_tgt_ops null_tgt_ops; @@ -451,10 +404,4 @@ extern const struct ublk_tgt_ops fault_inject_tgt_ops; void backing_file_tgt_deinit(struct ublk_dev *dev); int backing_file_tgt_init(struct ublk_dev *dev); -static inline unsigned int ilog2(unsigned int x) -{ - if (x == 0) - return 0; - return (sizeof(x) * 8 - 1) - __builtin_clz(x); -} #endif diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index afe0b99d77ee..f0e0003a4860 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -55,12 +55,13 @@ static void __setup_nop_io(int tag, const struct 
ublksrv_io_desc *iod, sqe->user_data = build_user_data(tag, ublk_op, 0, q_id, 1); } -static int null_queue_zc_io(struct ublk_queue *q, int tag) +static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q, + int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe[3]; - ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3); + ublk_io_alloc_sqes(t, sqe, 3); io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index); sqe[0]->user_data = build_user_data(tag, @@ -77,19 +78,21 @@ static int null_queue_zc_io(struct ublk_queue *q, int tag) return 2; } -static int null_queue_auto_zc_io(struct ublk_queue *q, int tag) +static int null_queue_auto_zc_io(struct ublk_thread *t, struct ublk_queue *q, + int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe[1]; - ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 1); + ublk_io_alloc_sqes(t, sqe, 1); __setup_nop_io(tag, iod, sqe[0], q->q_id); return 1; } -static void ublk_null_io_done(struct ublk_queue *q, int tag, - const struct io_uring_cqe *cqe) +static void ublk_null_io_done(struct ublk_thread *t, struct ublk_queue *q, + const struct io_uring_cqe *cqe) { + unsigned tag = user_data_to_tag(cqe->user_data); unsigned op = user_data_to_op(cqe->user_data); struct ublk_io *io = ublk_get_io(q, tag); @@ -105,11 +108,12 @@ static void ublk_null_io_done(struct ublk_queue *q, int tag, if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) io->tgt_ios += 1; - if (ublk_completed_tgt_io(q, tag)) - ublk_complete_io(q, tag, io->result); + if (ublk_completed_tgt_io(t, q, tag)) + ublk_complete_io(t, q, tag, io->result); } -static int ublk_null_queue_io(struct ublk_queue *q, int tag) +static int ublk_null_queue_io(struct ublk_thread *t, struct ublk_queue *q, + int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); unsigned auto_zc = ublk_queue_use_auto_zc(q); @@ -117,14 +121,14 @@ static int ublk_null_queue_io(struct ublk_queue *q, int tag) int queued; if (auto_zc && !ublk_io_auto_zc_fallback(iod)) - queued = null_queue_auto_zc_io(q, tag); + queued = null_queue_auto_zc_io(t, q, tag); else if (zc) - queued = null_queue_zc_io(q, tag); + queued = null_queue_zc_io(t, q, tag); else { - ublk_complete_io(q, tag, iod->nr_sectors << 9); + ublk_complete_io(t, q, tag, iod->nr_sectors << 9); return 0; } - ublk_queued_tgt_io(q, tag, queued); + ublk_queued_tgt_io(t, q, tag, queued); return 0; } @@ -134,7 +138,7 @@ static int ublk_null_queue_io(struct ublk_queue *q, int tag) */ static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag) { - if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK) + if (ublk_queue_auto_zc_fallback(q)) return (unsigned short)-1; return q->ios[tag].buf_index; } diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index 37d50bbf5f5e..1fb9b7cc281b 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -123,7 +123,8 @@ static inline enum io_uring_op stripe_to_uring_op( assert(0); } -static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, + const struct ublksrv_io_desc *iod, int tag) { const struct stripe_conf *conf = get_chunk_shift(q); unsigned auto_zc = (ublk_queue_use_auto_zc(q) != 0); @@ -138,7 +139,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_ io->private_data = s; calculate_stripe_array(conf, 
iod, s, base); - ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, s->nr + extra); + ublk_io_alloc_sqes(t, sqe, s->nr + extra); if (zc) { io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, io->buf_index); @@ -176,13 +177,14 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_ return s->nr + zc; } -static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +static int handle_flush(struct ublk_thread *t, struct ublk_queue *q, + const struct ublksrv_io_desc *iod, int tag) { const struct stripe_conf *conf = get_chunk_shift(q); struct io_uring_sqe *sqe[NR_STRIPE]; int i; - ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, conf->nr_files); + ublk_io_alloc_sqes(t, sqe, conf->nr_files); for (i = 0; i < conf->nr_files; i++) { io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC); io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); @@ -191,7 +193,8 @@ static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod, return conf->nr_files; } -static int stripe_queue_tgt_io(struct ublk_queue *q, int tag) +static int stripe_queue_tgt_io(struct ublk_thread *t, struct ublk_queue *q, + int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); unsigned ublk_op = ublksrv_get_op(iod); @@ -199,7 +202,7 @@ static int stripe_queue_tgt_io(struct ublk_queue *q, int tag) switch (ublk_op) { case UBLK_IO_OP_FLUSH: - ret = handle_flush(q, iod, tag); + ret = handle_flush(t, q, iod, tag); break; case UBLK_IO_OP_WRITE_ZEROES: case UBLK_IO_OP_DISCARD: @@ -207,7 +210,7 @@ static int stripe_queue_tgt_io(struct ublk_queue *q, int tag) break; case UBLK_IO_OP_READ: case UBLK_IO_OP_WRITE: - ret = stripe_queue_tgt_rw_io(q, iod, tag); + ret = stripe_queue_tgt_rw_io(t, q, iod, tag); break; default: ret = -EINVAL; @@ -218,17 +221,19 @@ static int stripe_queue_tgt_io(struct ublk_queue *q, int tag) return ret; } -static int ublk_stripe_queue_io(struct ublk_queue *q, int tag) +static int ublk_stripe_queue_io(struct ublk_thread *t, struct ublk_queue *q, + int tag) { - int queued = stripe_queue_tgt_io(q, tag); + int queued = stripe_queue_tgt_io(t, q, tag); - ublk_queued_tgt_io(q, tag, queued); + ublk_queued_tgt_io(t, q, tag, queued); return 0; } -static void ublk_stripe_io_done(struct ublk_queue *q, int tag, - const struct io_uring_cqe *cqe) +static void ublk_stripe_io_done(struct ublk_thread *t, struct ublk_queue *q, + const struct io_uring_cqe *cqe) { + unsigned tag = user_data_to_tag(cqe->user_data); const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); unsigned op = user_data_to_op(cqe->user_data); struct ublk_io *io = ublk_get_io(q, tag); @@ -257,13 +262,13 @@ static void ublk_stripe_io_done(struct ublk_queue *q, int tag, } } - if (ublk_completed_tgt_io(q, tag)) { + if (ublk_completed_tgt_io(t, q, tag)) { int res = io->result; if (!res) res = iod->nr_sectors << 9; - ublk_complete_io(q, tag, res); + ublk_complete_io(t, q, tag, res); free_stripe_array(io->private_data); io->private_data = NULL; diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h new file mode 100644 index 000000000000..36545d1567f1 --- /dev/null +++ b/tools/testing/selftests/ublk/utils.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef KUBLK_UTILS_H +#define KUBLK_UTILS_H + +#define __maybe_unused __attribute__((unused)) + +#ifndef min +#define min(a, b) ((a) < (b) ? 
(a) : (b)) +#endif + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) +#endif + +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + unsigned long __mptr = (unsigned long)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); }) +#endif + +#define round_up(val, rnd) \ + (((val) + ((rnd) - 1)) & ~((rnd) - 1)) + +static inline unsigned int ilog2(unsigned int x) +{ + if (x == 0) + return 0; + return (sizeof(x) * 8 - 1) - __builtin_clz(x); +} + +#define UBLK_DBG_DEV (1U << 0) +#define UBLK_DBG_THREAD (1U << 1) +#define UBLK_DBG_IO_CMD (1U << 2) +#define UBLK_DBG_IO (1U << 3) +#define UBLK_DBG_CTRL_CMD (1U << 4) +#define UBLK_LOG (1U << 5) + +extern unsigned int ublk_dbg_mask; + +static inline void ublk_err(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); +} + +static inline void ublk_log(const char *fmt, ...) +{ + if (ublk_dbg_mask & UBLK_LOG) { + va_list ap; + + va_start(ap, fmt); + vfprintf(stdout, fmt, ap); + } +} + +static inline void ublk_dbg(int level, const char *fmt, ...) +{ + if (level & ublk_dbg_mask) { + va_list ap; + + va_start(ap, fmt); + vfprintf(stdout, fmt, ap); + } +} + +#endif
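Across the ublk selftest changes above, the serving struct ublk_thread is now passed explicitly to the target callbacks and helpers instead of being reached through io->t, and the tag is recovered from cqe->user_data. A minimal sketch of a target written against the new signatures (the example_* names are hypothetical; .name, .init_tgt and other ops fields are omitted for brevity)::

    #include "kublk.h"

    /* Complete every request immediately; no target I/O is queued. */
    static int example_queue_io(struct ublk_thread *t, struct ublk_queue *q,
                                int tag)
    {
            const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);

            ublk_complete_io(t, q, tag, iod->nr_sectors << 9);
            return 0;
    }

    /* Called for target CQEs; the tag now comes from cqe->user_data. */
    static void example_tgt_io_done(struct ublk_thread *t, struct ublk_queue *q,
                                    const struct io_uring_cqe *cqe)
    {
            unsigned tag = user_data_to_tag(cqe->user_data);

            if (ublk_completed_tgt_io(t, q, tag))
                    ublk_complete_io(t, q, tag, ublk_get_io(q, tag)->result);
    }

    static const struct ublk_tgt_ops example_tgt_ops = {
            /* .name, .init_tgt, ... omitted */
            .queue_io = example_queue_io,
            .tgt_io_done = example_tgt_io_done,
    };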