diff options
1676 files changed, 47016 insertions, 18734 deletions
@@ -206,6 +206,7 @@ Danilo Krummrich <dakr@kernel.org> <dakr@redhat.com> David Brownell <david-b@pacbell.net> David Collins <quic_collinsd@quicinc.com> <collinsd@codeaurora.org> David Heidelberg <david@ixit.cz> <d.okias@gmail.com> +David Hildenbrand <david@kernel.org> <david@redhat.com> David Rheinsberg <david@readahead.eu> <dh.herrmann@gmail.com> David Rheinsberg <david@readahead.eu> <dh.herrmann@googlemail.com> David Rheinsberg <david@readahead.eu> <david.rheinsberg@gmail.com> @@ -426,7 +427,7 @@ Kenneth W Chen <kenneth.w.chen@intel.com> Kenneth Westfield <quic_kwestfie@quicinc.com> <kwestfie@codeaurora.org> Kiran Gunda <quic_kgunda@quicinc.com> <kgunda@codeaurora.org> Kirill Tkhai <tkhai@ya.ru> <ktkhai@virtuozzo.com> -Kirill A. Shutemov <kas@kernel.org> <kirill.shutemov@linux.intel.com> +Kiryl Shutsemau <kas@kernel.org> <kirill.shutemov@linux.intel.com> Kishon Vijay Abraham I <kishon@kernel.org> <kishon@ti.com> Konrad Dybcio <konradybcio@kernel.org> <konrad.dybcio@linaro.org> Konrad Dybcio <konradybcio@kernel.org> <konrad.dybcio@somainline.org> @@ -437,6 +438,7 @@ Krishna Manikandan <quic_mkrishn@quicinc.com> <mkrishn@codeaurora.org> Krzysztof Kozlowski <krzk@kernel.org> <k.kozlowski.k@gmail.com> Krzysztof Kozlowski <krzk@kernel.org> <k.kozlowski@samsung.com> Krzysztof Kozlowski <krzk@kernel.org> <krzysztof.kozlowski@canonical.com> +Krzysztof Kozlowski <krzk@kernel.org> <krzysztof.kozlowski@linaro.org> Krzysztof WilczyÅ„ski <kwilczynski@kernel.org> <krzysztof.wilczynski@linux.com> Krzysztof WilczyÅ„ski <kwilczynski@kernel.org> <kw@linux.com> Kshitiz Godara <quic_kgodara@quicinc.com> <kgodara@codeaurora.org> @@ -605,7 +607,8 @@ Oleksij Rempel <o.rempel@pengutronix.de> Oleksij Rempel <o.rempel@pengutronix.de> <ore@pengutronix.de> Oliver Hartkopp <socketcan@hartkopp.net> <oliver.hartkopp@volkswagen.de> Oliver Hartkopp <socketcan@hartkopp.net> <oliver@hartkopp.net> -Oliver Upton <oliver.upton@linux.dev> <oupton@google.com> +Oliver Upton <oupton@kernel.org> <oupton@google.com> +Oliver Upton <oupton@kernel.org> <oliver.upton@linux.dev> OndÅ™ej Jirman <megi@xff.cz> <megous@megous.com> Oza Pawandeep <quic_poza@quicinc.com> <poza@codeaurora.org> Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com> @@ -688,6 +691,8 @@ Sachin Mokashi <sachin.mokashi@intel.com> <sachinx.mokashi@intel.com> Sachin P Sant <ssant@in.ibm.com> Sai Prakash Ranjan <quic_saipraka@quicinc.com> <saiprakash.ranjan@codeaurora.org> Sakari Ailus <sakari.ailus@linux.intel.com> <sakari.ailus@iki.fi> +Sam Protsenko <semen.protsenko@linaro.org> +Sam Protsenko <semen.protsenko@linaro.org> <semen.protsenko@globallogic.com> Sam Ravnborg <sam@mars.ravnborg.org> Sankeerth Billakanti <quic_sbillaka@quicinc.com> <sbillaka@codeaurora.org> Santosh Shilimkar <santosh.shilimkar@oracle.org> diff --git a/Documentation/admin-guide/RAS/main.rst b/Documentation/admin-guide/RAS/main.rst index 447bfde509fb..5a45db32c49b 100644 --- a/Documentation/admin-guide/RAS/main.rst +++ b/Documentation/admin-guide/RAS/main.rst @@ -406,24 +406,8 @@ index of the MC:: |->mc2 .... -Under each ``mcX`` directory each ``csrowX`` is again represented by a -``csrowX``, where ``X`` is the csrow index:: - - .../mc/mc0/ - | - |->csrow0 - |->csrow2 - |->csrow3 - .... - -Notice that there is no csrow1, which indicates that csrow0 is composed -of a single ranked DIMMs. This should also apply in both Channels, in -order to have dual-channel mode be operational. Since both csrow2 and -csrow3 are populated, this indicates a dual ranked set of DIMMs for -channels 0 and 1. - -Within each of the ``mcX`` and ``csrowX`` directories are several EDAC -control and attribute files. +Within each of the ``mcX`` directory are several EDAC control and +attribute files. ``mcX`` directories ------------------- @@ -569,7 +553,7 @@ this ``X`` memory module: - Unbuffered-DDR .. [#f5] On some systems, the memory controller doesn't have any logic - to identify the memory module. On such systems, the directory is called ``rankX`` and works on a similar way as the ``csrowX`` directories. + to identify the memory module. On such systems, the directory is called ``rankX``. On modern Intel memory controllers, the memory controller identifies the memory modules directly. On such systems, the directory is called ``dimmX``. @@ -577,126 +561,6 @@ this ``X`` memory module: symlinks inside the sysfs mapping that are automatically created by the sysfs subsystem. Currently, they serve no purpose. -``csrowX`` directories ----------------------- - -When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the ``csrowX`` -directories. As this API doesn't work properly for Rambus, FB-DIMMs and -modern Intel Memory Controllers, this is being deprecated in favor of -``dimmX`` directories. - -In the ``csrowX`` directories are EDAC control and attribute files for -this ``X`` instance of csrow: - - -- ``ue_count`` - Total Uncorrectable Errors count attribute file - - This attribute file displays the total count of uncorrectable - errors that have occurred on this csrow. If panic_on_ue is set - this counter will not have a chance to increment, since EDAC - will panic the system. - - -- ``ce_count`` - Total Correctable Errors count attribute file - - This attribute file displays the total count of correctable - errors that have occurred on this csrow. This count is very - important to examine. CEs provide early indications that a - DIMM is beginning to fail. This count field should be - monitored for non-zero values and report such information - to the system administrator. - - -- ``size_mb`` - Total memory managed by this csrow attribute file - - This attribute file displays, in count of megabytes, the memory - that this csrow contains. - - -- ``mem_type`` - Memory Type attribute file - - This attribute file will display what type of memory is currently - on this csrow. Normally, either buffered or unbuffered memory. - Examples: - - - Registered-DDR - - Unbuffered-DDR - - -- ``edac_mode`` - EDAC Mode of operation attribute file - - This attribute file will display what type of Error detection - and correction is being utilized. - - -- ``dev_type`` - Device type attribute file - - This attribute file will display what type of DRAM device is - being utilized on this DIMM. - Examples: - - - x1 - - x2 - - x4 - - x8 - - -- ``ch0_ce_count`` - Channel 0 CE Count attribute file - - This attribute file will display the count of CEs on this - DIMM located in channel 0. - - -- ``ch0_ue_count`` - Channel 0 UE Count attribute file - - This attribute file will display the count of UEs on this - DIMM located in channel 0. - - -- ``ch0_dimm_label`` - Channel 0 DIMM Label control file - - - This control file allows this DIMM to have a label assigned - to it. With this label in the module, when errors occur - the output can provide the DIMM label in the system log. - This becomes vital for panic events to isolate the - cause of the UE event. - - DIMM Labels must be assigned after booting, with information - that correctly identifies the physical slot with its - silk screen label. This information is currently very - motherboard specific and determination of this information - must occur in userland at this time. - - -- ``ch1_ce_count`` - Channel 1 CE Count attribute file - - - This attribute file will display the count of CEs on this - DIMM located in channel 1. - - -- ``ch1_ue_count`` - Channel 1 UE Count attribute file - - - This attribute file will display the count of UEs on this - DIMM located in channel 0. - - -- ``ch1_dimm_label`` - Channel 1 DIMM Label control file - - This control file allows this DIMM to have a label assigned - to it. With this label in the module, when errors occur - the output can provide the DIMM label in the system log. - This becomes vital for panic events to isolate the - cause of the UE event. - - DIMM Labels must be assigned after booting, with information - that correctly identifies the physical slot with its - silk screen label. This information is currently very - motherboard specific and determination of this information - must occur in userland at this time. - System Logging -------------- diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6c42061ca20e..8c5636a120ee 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6207,7 +6207,7 @@ rdt= [HW,X86,RDT] Turn on/off individual RDT features. List is: cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, - mba, smba, bmec, abmc. + mba, smba, bmec, abmc, sdciae. E.g. to turn on cmt and turn off mba use: rdt=cmt,!mba @@ -6500,6 +6500,10 @@ Memory area to be used by remote processor image, managed by CMA. + rseq_debug= [KNL] Enable or disable restartable sequence + debug mode. Defaults to CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE. + Format: <bool> + rt_group_sched= [KNL] Enable or disable SCHED_RR/FIFO group scheduling when CONFIG_RT_GROUP_SCHED=y. Defaults to !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED. diff --git a/Documentation/devicetree/bindings/gpio/ti,twl4030-gpio.yaml b/Documentation/devicetree/bindings/gpio/ti,twl4030-gpio.yaml index 5e3e199fd9a4..96d50d14c071 100644 --- a/Documentation/devicetree/bindings/gpio/ti,twl4030-gpio.yaml +++ b/Documentation/devicetree/bindings/gpio/ti,twl4030-gpio.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) %YAML 1.2 --- -$id: http://devicetree.org/schemas/ti,twl4030-gpio.yaml# +$id: http://devicetree.org/schemas/gpio/ti,twl4030-gpio.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: TI TWL4030 GPIO controller diff --git a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml index 3d60d9e9e208..d0fad930de9d 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.yaml @@ -39,6 +39,9 @@ properties: - amlogic,a4-gpio-ao-intc - amlogic,a5-gpio-intc - amlogic,c3-gpio-intc + - amlogic,s6-gpio-intc + - amlogic,s7-gpio-intc + - amlogic,s7d-gpio-intc - amlogic,t7-gpio-intc - const: amlogic,meson-gpio-intc diff --git a/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2700-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2700-intc.yaml index 55636d06a674..999df5b905c5 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2700-intc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/aspeed,ast2700-intc.yaml @@ -25,13 +25,14 @@ properties: interrupt-controller: true '#interrupt-cells': - const: 2 + const: 1 description: The first cell is the IRQ number, the second cell is the trigger type as defined in interrupt.txt in this directory. interrupts: - maxItems: 6 + minItems: 1 + maxItems: 10 description: | Depend to which INTC0 or INTC1 used. INTC0 and INTC1 are two kinds of interrupt controller with enable and raw @@ -74,13 +75,17 @@ examples: interrupt-controller@12101b00 { compatible = "aspeed,ast2700-intc-ic"; reg = <0 0x12101b00 0 0x10>; - #interrupt-cells = <2>; + #interrupt-cells = <1>; interrupt-controller; interrupts = <GIC_SPI 192 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 193 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 194 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 195 IRQ_TYPE_LEVEL_HIGH>, <GIC_SPI 196 IRQ_TYPE_LEVEL_HIGH>, - <GIC_SPI 197 IRQ_TYPE_LEVEL_HIGH>; + <GIC_SPI 197 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 198 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 199 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 200 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 201 IRQ_TYPE_LEVEL_HIGH>; }; }; diff --git a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml index f683d696909b..6fdb7ae9e85a 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/sifive,plic-1.0.0.yaml @@ -58,6 +58,7 @@ properties: - const: andestech,nceplic100 - items: - enum: + - anlogic,dr1v90-plic - canaan,k210-plic - eswin,eic7700-plic - sifive,fu540-c000-plic @@ -76,6 +77,9 @@ properties: - thead,th1520-plic - const: thead,c900-plic - items: + - const: ultrarisc,dp1000-plic + - const: ultrarisc,cp100-plic + - items: - const: sifive,plic-1.0.0 - const: riscv,plic0 deprecated: true diff --git a/Documentation/devicetree/bindings/interrupt-controller/thead,c900-aclint-mswi.yaml b/Documentation/devicetree/bindings/interrupt-controller/thead,c900-aclint-mswi.yaml index d6fb08a54167..62fd220e126e 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/thead,c900-aclint-mswi.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/thead,c900-aclint-mswi.yaml @@ -4,18 +4,23 @@ $id: http://devicetree.org/schemas/interrupt-controller/thead,c900-aclint-mswi.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Sophgo sg2042 CLINT Machine-level Software Interrupt Device +title: ACLINT Machine-level Software Interrupt Device maintainers: - Inochi Amaoto <inochiama@outlook.com> properties: compatible: - items: - - enum: - - sophgo,sg2042-aclint-mswi - - sophgo,sg2044-aclint-mswi - - const: thead,c900-aclint-mswi + oneOf: + - items: + - enum: + - sophgo,sg2042-aclint-mswi + - sophgo,sg2044-aclint-mswi + - const: thead,c900-aclint-mswi + - items: + - enum: + - anlogic,dr1v90-aclint-mswi + - const: nuclei,ux900-aclint-mswi reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/interrupt-controller/thead,c900-aclint-sswi.yaml b/Documentation/devicetree/bindings/interrupt-controller/thead,c900-aclint-sswi.yaml index c1ab865fcd64..d02c6886283a 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/thead,c900-aclint-sswi.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/thead,c900-aclint-sswi.yaml @@ -30,6 +30,10 @@ properties: - const: thead,c900-aclint-sswi - items: - const: mips,p8700-aclint-sswi + - items: + - enum: + - anlogic,dr1v90-aclint-sswi + - const: nuclei,ux900-aclint-sswi reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml index 19d47fd414bc..ce04d2eadec9 100644 --- a/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml @@ -50,18 +50,20 @@ patternProperties: groups: description: Name of the pin group to use for the functions. - $ref: /schemas/types.yaml#/definitions/string - enum: [i2c0_grp, i2c1_grp, i2c2_grp, i2c3_grp, i2c4_grp, - i2c5_grp, i2c6_grp, i2c7_grp, i2c8_grp, - spi0_grp, spi0_cs0_grp, spi0_cs1_grp, spi0_cs2_grp, - spi1_grp, spi2_grp, spi3_grp, spi4_grp, spi5_grp, spi6_grp, - uart0_grp, uart1_grp, uart2_grp, uart3_grp, - pwm0_gpio4_grp, pwm0_gpio8_grp, pwm0_gpio12_grp, - pwm0_gpio16_grp, pwm1_gpio5_grp, pwm1_gpio9_grp, - pwm1_gpio13_grp, pwm1_gpio17_grp, pwm2_gpio6_grp, - pwm2_gpio10_grp, pwm2_gpio14_grp, pwm2_gpio18_grp, - pwm3_gpio7_grp, pwm3_gpio11_grp, pwm3_gpio15_grp, - pwm3_gpio19_grp, pcmif_out_grp, pcmif_in_grp] + items: + enum: [i2c0_grp, i2c1_grp, i2c2_grp, i2c3_grp, i2c4_grp, + i2c5_grp, i2c6_grp, i2c7_grp, i2c8_grp, + spi0_grp, spi0_cs0_grp, spi0_cs1_grp, spi0_cs2_grp, + spi1_grp, spi2_grp, spi3_grp, spi4_grp, spi5_grp, spi6_grp, + uart0_grp, uart1_grp, uart2_grp, uart3_grp, + pwm0_gpio4_grp, pwm0_gpio8_grp, pwm0_gpio12_grp, + pwm0_gpio16_grp, pwm1_gpio5_grp, pwm1_gpio9_grp, + pwm1_gpio13_grp, pwm1_gpio17_grp, pwm2_gpio6_grp, + pwm2_gpio10_grp, pwm2_gpio14_grp, pwm2_gpio18_grp, + pwm3_gpio7_grp, pwm3_gpio11_grp, pwm3_gpio15_grp, + pwm3_gpio19_grp, pcmif_out_grp, pcmif_in_grp] + minItems: 1 + maxItems: 8 drive-strength: enum: [2, 4, 6, 8, 16, 24, 32] diff --git a/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml index 55ece6a8be5e..81e2164ea98f 100644 --- a/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml +++ b/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml @@ -74,6 +74,7 @@ patternProperties: '^conf': type: object + unevaluatedProperties: false description: Pinctrl node's client devices use subnodes for pin configurations, which in turn use the standard properties below. diff --git a/Documentation/devicetree/bindings/timer/realtek,rtd1625-systimer.yaml b/Documentation/devicetree/bindings/timer/realtek,rtd1625-systimer.yaml new file mode 100644 index 000000000000..e08d3d2d306b --- /dev/null +++ b/Documentation/devicetree/bindings/timer/realtek,rtd1625-systimer.yaml @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/realtek,rtd1625-systimer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Realtek System Timer + +maintainers: + - Hao-Wen Ting <haowen.ting@realtek.com> + +description: + The Realtek SYSTIMER (System Timer) is a 64-bit global hardware counter operating + at a fixed 1MHz frequency. Thanks to its compare match interrupt capability, + the timer natively supports oneshot mode for tick broadcast functionality. + +properties: + compatible: + oneOf: + - const: realtek,rtd1625-systimer + - items: + - const: realtek,rtd1635-systimer + - const: realtek,rtd1625-systimer + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/arm-gic.h> + + timer@89420 { + compatible = "realtek,rtd1635-systimer", + "realtek,rtd1625-systimer"; + reg = <0x89420 0x18>; + interrupts = <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>; + }; diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index f1d1882009ba..647746e6f75f 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -1705,6 +1705,8 @@ patternProperties: description: Universal Scientific Industrial Co., Ltd. "^usr,.*": description: U.S. Robotics Corporation + "^ultrarisc,.*": + description: UltraRISC Technology Co., Ltd. "^ultratronik,.*": description: Ultratronik GmbH "^utoo,.*": diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 387fd9cc72ca..da982ca7e413 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -135,6 +135,27 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap: * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``. +``struct iomap_read_ops`` +-------------------------- + +.. code-block:: c + + struct iomap_read_ops { + int (*read_folio_range)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t len); + void (*submit_read)(struct iomap_read_folio_ctx *ctx); + }; + +iomap calls these functions: + + - ``read_folio_range``: Called to read in the range. This must be provided + by the caller. If this succeeds, iomap_finish_folio_read() must be called + after the range is read in, regardless of whether the read succeeded or + failed. + + - ``submit_read``: Submit any pending read requests. This function is + optional. + Internal per-Folio State ------------------------ @@ -182,6 +203,28 @@ The ``flags`` argument to ``->iomap_begin`` will be set to zero. The pagecache takes whatever locks it needs before calling the filesystem. +Both ``iomap_readahead`` and ``iomap_read_folio`` pass in a ``struct +iomap_read_folio_ctx``: + +.. code-block:: c + + struct iomap_read_folio_ctx { + const struct iomap_read_ops *ops; + struct folio *cur_folio; + struct readahead_control *rac; + void *read_ctx; + }; + +``iomap_readahead`` must set: + * ``ops->read_folio_range()`` and ``rac`` + +``iomap_read_folio`` must set: + * ``ops->read_folio_range()`` and ``cur_folio`` + +``ops->submit_read()`` and ``read_ctx`` are optional. ``read_ctx`` is used to +pass in any custom data the caller needs accessible in the ops callbacks for +fulfilling reads. + Buffered Writes --------------- @@ -317,6 +360,9 @@ The fields are as follows: delalloc reservations to avoid having delalloc reservations for clean pagecache. This function must be supplied by the filesystem. + If this succeeds, iomap_finish_folio_write() must be called once writeback + completes for the range, regardless of whether the writeback succeeded or + failed. - ``writeback_submit``: Submit the previous built writeback context. Block based file systems should use the iomap_ioend_writeback_submit @@ -444,10 +490,6 @@ These ``struct kiocb`` flags are significant for direct I/O with iomap: Only meaningful for asynchronous I/O, and only if the entire I/O can be issued as a single ``struct bio``. - * ``IOCB_DIO_CALLER_COMP``: Try to run I/O completion from the caller's - process context. - See ``linux/fs.h`` for more details. - Filesystems should call ``iomap_dio_rw`` from ``->read_iter`` and ``->write_iter``, and set ``FMODE_CAN_ODIRECT`` in the ``->open`` function for the file. diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 7233b04668fc..d33429294252 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -211,7 +211,7 @@ test and set for you. e.g.:: inode = iget_locked(sb, ino); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { err = read_inode_from_disk(inode); if (err < 0) { iget_failed(inode); @@ -1309,3 +1309,16 @@ a different length, use vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len)) instead. + +--- + +**mandatory** + +vfs_mkdir() now returns a dentry - the one returned by ->mkdir(). If +that dentry is different from the dentry passed in, including if it is +an IS_ERR() dentry pointer, the original dentry is dput(). + +When vfs_mkdir() returns an error, and so both dputs() the original +dentry and doesn't provide a replacement, it also unlocks the parent. +Consequently the return value from vfs_mkdir() can be passed to +end_creating() and the parent will be unlocked precisely when necessary. diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index b7f35b07876a..8c8ce678148a 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -17,17 +17,18 @@ AMD refers to this feature as AMD Platform Quality of Service(AMD QoS). This feature is enabled by the CONFIG_X86_CPU_RESCTRL and the x86 /proc/cpuinfo flag bits: -=============================================== ================================ -RDT (Resource Director Technology) Allocation "rdt_a" -CAT (Cache Allocation Technology) "cat_l3", "cat_l2" -CDP (Code and Data Prioritization) "cdp_l3", "cdp_l2" -CQM (Cache QoS Monitoring) "cqm_llc", "cqm_occup_llc" -MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local" -MBA (Memory Bandwidth Allocation) "mba" -SMBA (Slow Memory Bandwidth Allocation) "" -BMEC (Bandwidth Monitoring Event Configuration) "" -ABMC (Assignable Bandwidth Monitoring Counters) "" -=============================================== ================================ +=============================================================== ================================ +RDT (Resource Director Technology) Allocation "rdt_a" +CAT (Cache Allocation Technology) "cat_l3", "cat_l2" +CDP (Code and Data Prioritization) "cdp_l3", "cdp_l2" +CQM (Cache QoS Monitoring) "cqm_llc", "cqm_occup_llc" +MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local" +MBA (Memory Bandwidth Allocation) "mba" +SMBA (Slow Memory Bandwidth Allocation) "" +BMEC (Bandwidth Monitoring Event Configuration) "" +ABMC (Assignable Bandwidth Monitoring Counters) "" +SDCIAE (Smart Data Cache Injection Allocation Enforcement) "" +=============================================================== ================================ Historically, new features were made visible by default in /proc/cpuinfo. This resulted in the feature flags becoming hard to parse by humans. Adding a new @@ -72,6 +73,11 @@ The 'info' directory contains information about the enabled resources. Each resource has its own subdirectory. The subdirectory names reflect the resource names. +Most of the files in the resource's subdirectory are read-only, and +describe properties of the resource. Resources that support global +configuration options also include writable files that can be used +to modify those settings. + Each subdirectory contains the following files with respect to allocation: @@ -90,12 +96,19 @@ related to allocation: must be set when writing a mask. "shareable_bits": - Bitmask of shareable resource with other executing - entities (e.g. I/O). User can use this when - setting up exclusive cache partitions. Note that - some platforms support devices that have their - own settings for cache use which can over-ride - these bits. + Bitmask of shareable resource with other executing entities + (e.g. I/O). Applies to all instances of this resource. User + can use this when setting up exclusive cache partitions. + Note that some platforms support devices that have their + own settings for cache use which can over-ride these bits. + + When "io_alloc" is enabled, a portion of each cache instance can + be configured for shared use between hardware and software. + "bit_usage" should be used to see which portions of each cache + instance is configured for hardware use via "io_alloc" feature + because every cache instance can have its "io_alloc" bitmask + configured independently via "io_alloc_cbm". + "bit_usage": Annotated capacity bitmasks showing how all instances of the resource are used. The legend is: @@ -109,16 +122,16 @@ related to allocation: "H": Corresponding region is used by hardware only but available for software use. If a resource - has bits set in "shareable_bits" but not all - of these bits appear in the resource groups' - schematas then the bits appearing in - "shareable_bits" but no resource group will - be marked as "H". + has bits set in "shareable_bits" or "io_alloc_cbm" + but not all of these bits appear in the resource + groups' schemata then the bits appearing in + "shareable_bits" or "io_alloc_cbm" but no + resource group will be marked as "H". "X": Corresponding region is available for sharing and - used by hardware and software. These are the - bits that appear in "shareable_bits" as - well as a resource group's allocation. + used by hardware and software. These are the bits + that appear in "shareable_bits" or "io_alloc_cbm" + as well as a resource group's allocation. "S": Corresponding region is used by software and available for sharing. @@ -136,6 +149,77 @@ related to allocation: "1": Non-contiguous 1s value in CBM is supported. +"io_alloc": + "io_alloc" enables system software to configure the portion of + the cache allocated for I/O traffic. File may only exist if the + system supports this feature on some of its cache resources. + + "disabled": + Resource supports "io_alloc" but the feature is disabled. + Portions of cache used for allocation of I/O traffic cannot + be configured. + "enabled": + Portions of cache used for allocation of I/O traffic + can be configured using "io_alloc_cbm". + "not supported": + Support not available for this resource. + + The feature can be modified by writing to the interface, for example: + + To enable:: + + # echo 1 > /sys/fs/resctrl/info/L3/io_alloc + + To disable:: + + # echo 0 > /sys/fs/resctrl/info/L3/io_alloc + + The underlying implementation may reduce resources available to + general (CPU) cache allocation. See architecture specific notes + below. Depending on usage requirements the feature can be enabled + or disabled. + + On AMD systems, io_alloc feature is supported by the L3 Smart + Data Cache Injection Allocation Enforcement (SDCIAE). The CLOSID for + io_alloc is the highest CLOSID supported by the resource. When + io_alloc is enabled, the highest CLOSID is dedicated to io_alloc and + no longer available for general (CPU) cache allocation. When CDP is + enabled, io_alloc routes I/O traffic using the highest CLOSID allocated + for the instruction cache (CDP_CODE), making this CLOSID no longer + available for general (CPU) cache allocation for both the CDP_CODE + and CDP_DATA resources. + +"io_alloc_cbm": + Capacity bitmasks that describe the portions of cache instances to + which I/O traffic from supported I/O devices are routed when "io_alloc" + is enabled. + + CBMs are displayed in the following format: + + <cache_id0>=<cbm>;<cache_id1>=<cbm>;... + + Example:: + + # cat /sys/fs/resctrl/info/L3/io_alloc_cbm + 0=ffff;1=ffff + + CBMs can be configured by writing to the interface. + + Example:: + + # echo 1=ff > /sys/fs/resctrl/info/L3/io_alloc_cbm + # cat /sys/fs/resctrl/info/L3/io_alloc_cbm + 0=ffff;1=00ff + + # echo "0=ff;1=f" > /sys/fs/resctrl/info/L3/io_alloc_cbm + # cat /sys/fs/resctrl/info/L3/io_alloc_cbm + 0=00ff;1=000f + + When CDP is enabled "io_alloc_cbm" associated with the CDP_DATA and CDP_CODE + resources may reflect the same values. For example, values read from and + written to /sys/fs/resctrl/info/L3DATA/io_alloc_cbm may be reflected by + /sys/fs/resctrl/info/L3CODE/io_alloc_cbm and vice versa. + Memory bandwidth(MB) subdirectory contains the following files with respect to allocation: diff --git a/Documentation/firmware-guide/acpi/i2c-muxes.rst b/Documentation/firmware-guide/acpi/i2c-muxes.rst index f366539acd79..96ef4012d78f 100644 --- a/Documentation/firmware-guide/acpi/i2c-muxes.rst +++ b/Documentation/firmware-guide/acpi/i2c-muxes.rst @@ -37,8 +37,8 @@ which corresponds to the following ASL (in the scope of \_SB):: Name (_HID, ...) Name (_CRS, ResourceTemplate () { I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED, - AddressingMode7Bit, "\\_SB.SMB1.CH00", 0x00, - ResourceConsumer,,) + AddressingMode7Bit, "\\_SB.SMB1.MUX0.CH00", + 0x00, ResourceConsumer,,) } } } @@ -52,8 +52,8 @@ which corresponds to the following ASL (in the scope of \_SB):: Name (_HID, ...) Name (_CRS, ResourceTemplate () { I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED, - AddressingMode7Bit, "\\_SB.SMB1.CH01", 0x00, - ResourceConsumer,,) + AddressingMode7Bit, "\\_SB.SMB1.MUX0.CH01", + 0x00, ResourceConsumer,,) } } } diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst index 1ead9bb8d9c6..4424cbff251f 100644 --- a/Documentation/input/event-codes.rst +++ b/Documentation/input/event-codes.rst @@ -400,19 +400,30 @@ can report through the rotational axes (absolute and/or relative rx, ry, rz). All other axes retain their meaning. A device must not mix regular directional axes and accelerometer axes on the same event node. -INPUT_PROP_HAPTIC_TOUCHPAD --------------------------- +INPUT_PROP_PRESSUREPAD +---------------------- + +The INPUT_PROP_PRESSUREPAD property indicates that the device provides +simulated haptic feedback (e.g. a vibrator motor situated below the surface) +instead of physical haptic feedback (e.g. a hinge). This property is only set +if the device: -The INPUT_PROP_HAPTIC_TOUCHPAD property indicates that device: -- supports simple haptic auto and manual triggering - can differentiate between at least 5 fingers - uses correct resolution for the X/Y (units and value) -- reports correct force per touch, and correct units for them (newtons or grams) - follows the MT protocol type B +If the simulated haptic feedback is controllable by userspace the device must: + +- support simple haptic auto and manual triggering, and +- report correct force per touch, and correct units for them (newtons or grams), and +- provide the EV_FF FF_HAPTIC force feedback effect. + Summing up, such devices follow the MS spec for input devices in -Win8 and Win8.1, and in addition support the Simple haptic controller HID table, -and report correct units for the pressure. +Win8 and Win8.1, and in addition may support the Simple haptic controller HID +table, and report correct units for the pressure. + +Where applicable, this property is set in addition to INPUT_PROP_BUTTONPAD, it +does not replace that property. Guidelines ========== diff --git a/Documentation/locking/seqlock.rst b/Documentation/locking/seqlock.rst index 3fb7ea3ab22a..9899871d3d9a 100644 --- a/Documentation/locking/seqlock.rst +++ b/Documentation/locking/seqlock.rst @@ -220,13 +220,14 @@ Read path, three categories: according to a passed marker. This is used to avoid lockless readers starvation (too much retry loops) in case of a sharp spike in write activity. First, a lockless read is tried (even marker passed). If - that trial fails (odd sequence counter is returned, which is used as - the next iteration marker), the lockless read is transformed to a - full locking read and no retry loop is necessary:: + that trial fails (sequence counter doesn't match), make the marker + odd for the next iteration, the lockless read is transformed to a + full locking read and no retry loop is necessary, for example:: /* marker; even initialization */ - int seq = 0; + int seq = 1; do { + seq++; /* 2 on the 1st/lockless path, otherwise odd */ read_seqbegin_or_lock(&foo_seqlock, &seq); /* ... [[read-side critical section]] ... */ diff --git a/Documentation/sound/codecs/cs35l56.rst b/Documentation/sound/codecs/cs35l56.rst index 57d1964453e1..d5363b08f515 100644 --- a/Documentation/sound/codecs/cs35l56.rst +++ b/Documentation/sound/codecs/cs35l56.rst @@ -105,10 +105,10 @@ In this example the SSID is 10280c63. The format of the firmware file names is: -SoundWire (except CS35L56 Rev B0): +SoundWire: cs35lxx-b0-dsp1-misc-SSID[-spkidX]-l?u? -SoundWire CS35L56 Rev B0: +SoundWire CS35L56 Rev B0 firmware released before kernel version 6.16: cs35lxx-b0-dsp1-misc-SSID[-spkidX]-ampN Non-SoundWire (HDA and I2S): @@ -127,9 +127,8 @@ Where: * spkidX is an optional part, used for laptops that have firmware configurations for different makes and models of internal speakers. -The CS35L56 Rev B0 continues to use the old filename scheme because a -large number of firmware files have already been published with these -names. +Early firmware for CS35L56 Rev B0 used the ALSA prefix (ampN) as the +filename qualifier. Support for the l?u? qualifier was added in kernel 6.16. Sound Open Firmware and ALSA topology files ------------------------------------------- diff --git a/Documentation/userspace-api/netlink/intro-specs.rst b/Documentation/userspace-api/netlink/intro-specs.rst index a4435ae4628d..e5ebc617754a 100644 --- a/Documentation/userspace-api/netlink/intro-specs.rst +++ b/Documentation/userspace-api/netlink/intro-specs.rst @@ -13,10 +13,10 @@ Simple CLI Kernel comes with a simple CLI tool which should be useful when developing Netlink related code. The tool is implemented in Python and can use a YAML specification to issue Netlink requests -to the kernel. Only Generic Netlink is supported. +to the kernel. The tool is located at ``tools/net/ynl/pyynl/cli.py``. It accepts -a handul of arguments, the most important ones are: +a handful of arguments, the most important ones are: - ``--spec`` - point to the spec file - ``--do $name`` / ``--dump $name`` - issue request ``$name`` diff --git a/Documentation/wmi/driver-development-guide.rst b/Documentation/wmi/driver-development-guide.rst index 99ef21fc1c1e..5680303ae314 100644 --- a/Documentation/wmi/driver-development-guide.rst +++ b/Documentation/wmi/driver-development-guide.rst @@ -54,6 +54,7 @@ to matching WMI devices using a struct wmi_device_id table: :: static const struct wmi_device_id foo_id_table[] = { + /* Only use uppercase letters! */ { "936DA01F-9ABD-4D9D-80C7-02AF85C822A8", NULL }, { } }; diff --git a/MAINTAINERS b/MAINTAINERS index 46bd8e033042..e8ad048bf35d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -915,6 +915,7 @@ F: drivers/staging/media/sunxi/cedrus/ ALPHA PORT M: Richard Henderson <richard.henderson@linaro.org> M: Matt Turner <mattst88@gmail.com> +M: Magnus Lindholm <linmag7@gmail.com> L: linux-alpha@vger.kernel.org S: Odd Fixes F: arch/alpha/ @@ -3925,7 +3926,7 @@ F: crypto/async_tx/ F: include/linux/async_tx.h AT24 EEPROM DRIVER -M: Bartosz Golaszewski <brgl@bgdev.pl> +M: Bartosz Golaszewski <brgl@kernel.org> L: linux-i2c@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git @@ -4398,7 +4399,7 @@ BLOCK LAYER M: Jens Axboe <axboe@kernel.dk> L: linux-block@vger.kernel.org S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git F: Documentation/ABI/stable/sysfs-block F: Documentation/block/ F: block/ @@ -4818,6 +4819,7 @@ F: drivers/net/dsa/b53/* F: drivers/net/dsa/bcm_sf2* F: include/linux/dsa/brcm.h F: include/linux/platform_data/b53.h +F: net/dsa/tag_brcm.c BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE M: Florian Fainelli <florian.fainelli@broadcom.com> @@ -9207,6 +9209,7 @@ R: Yue Hu <zbestahu@gmail.com> R: Jeffle Xu <jefflexu@linux.alibaba.com> R: Sandeep Dhavale <dhavale@google.com> R: Hongbo Li <lihongbo22@huawei.com> +R: Chunhai Guo <guochunhai@vivo.com> L: linux-erofs@lists.ozlabs.org S: Maintained W: https://erofs.docs.kernel.org @@ -9263,7 +9266,6 @@ M: Ido Schimmel <idosch@nvidia.com> L: bridge@lists.linux.dev L: netdev@vger.kernel.org S: Maintained -W: http://www.linuxfoundation.org/en/Net:Bridge F: include/linux/if_bridge.h F: include/uapi/linux/if_bridge.h F: include/linux/netfilter_bridge/ @@ -10676,7 +10678,7 @@ F: tools/gpio/gpio-sloppy-logic-analyzer.sh GPIO SUBSYSTEM M: Linus Walleij <linus.walleij@linaro.org> -M: Bartosz Golaszewski <brgl@bgdev.pl> +M: Bartosz Golaszewski <brgl@kernel.org> L: linux-gpio@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git @@ -10693,7 +10695,7 @@ K: GPIOD_FLAGS_BIT_NONEXCLUSIVE K: devm_gpiod_unhinge GPIO UAPI -M: Bartosz Golaszewski <brgl@bgdev.pl> +M: Bartosz Golaszewski <brgl@kernel.org> R: Kent Gibson <warthog618@gmail.com> L: linux-gpio@vger.kernel.org S: Maintained @@ -11525,7 +11527,7 @@ F: include/linux/platform_data/huawei-gaokun-ec.h HUGETLB SUBSYSTEM M: Muchun Song <muchun.song@linux.dev> M: Oscar Salvador <osalvador@suse.de> -R: David Hildenbrand <david@redhat.com> +R: David Hildenbrand <david@kernel.org> L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages @@ -12521,6 +12523,7 @@ F: include/linux/avf/virtchnl.h F: include/linux/net/intel/*/ INTEL ETHERNET PROTOCOL DRIVER FOR RDMA +M: Krzysztof Czurylo <krzysztof.czurylo@intel.com> M: Tatyana Nikolova <tatyana.e.nikolova@intel.com> L: linux-rdma@vger.kernel.org S: Supported @@ -12861,7 +12864,8 @@ F: tools/testing/selftests/sgx/* K: \bSGX_ INTEL SKYLAKE INT3472 ACPI DEVICE DRIVER -M: Daniel Scally <djrscally@gmail.com> +M: Daniel Scally <dan.scally@ideasonboard.com> +M: Sakari Ailus <sakari.ailus@linux.intel.com> S: Maintained F: drivers/platform/x86/intel/int3472/ F: include/linux/platform_data/x86/int3472.h @@ -13656,7 +13660,7 @@ F: virt/kvm/* KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64) M: Marc Zyngier <maz@kernel.org> -M: Oliver Upton <oliver.upton@linux.dev> +M: Oliver Upton <oupton@kernel.org> R: Joey Gouly <joey.gouly@arm.com> R: Suzuki K Poulose <suzuki.poulose@arm.com> R: Zenghui Yu <yuzenghui@huawei.com> @@ -13730,7 +13734,7 @@ KERNEL VIRTUAL MACHINE for s390 (KVM/s390) M: Christian Borntraeger <borntraeger@linux.ibm.com> M: Janosch Frank <frankja@linux.ibm.com> M: Claudio Imbrenda <imbrenda@linux.ibm.com> -R: David Hildenbrand <david@redhat.com> +R: David Hildenbrand <david@kernel.org> L: kvm@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git @@ -13795,6 +13799,7 @@ F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* F: include/linux/kexec_handover.h F: kernel/kexec_handover.c +F: lib/test_kho.c F: tools/testing/selftests/kho/ KEYS-ENCRYPTED @@ -14454,10 +14459,11 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching.g F: Documentation/ABI/testing/sysfs-kernel-livepatch F: Documentation/livepatch/ F: arch/powerpc/include/asm/livepatch.h -F: include/linux/livepatch.h +F: include/linux/livepatch*.h F: kernel/livepatch/ F: kernel/module/livepatch.c F: samples/livepatch/ +F: scripts/livepatch/ F: tools/testing/selftests/livepatch/ LLC (802.2) @@ -14531,6 +14537,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core F: Documentation/locking/ F: arch/*/include/asm/spinlock*.h +F: include/linux/local_lock*.h F: include/linux/lockdep*.h F: include/linux/mutex*.h F: include/linux/rwlock*.h @@ -15305,7 +15312,7 @@ F: drivers/pwm/pwm-max7360.c F: include/linux/mfd/max7360.h MAXIM MAX77650 PMIC MFD DRIVER -M: Bartosz Golaszewski <brgl@bgdev.pl> +M: Bartosz Golaszewski <brgl@kernel.org> L: linux-kernel@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/*/*max77650.yaml @@ -16201,7 +16208,7 @@ MEMORY CONTROLLER DRIVERS M: Krzysztof Kozlowski <krzk@kernel.org> L: linux-kernel@vger.kernel.org S: Maintained -B: mailto:krzysztof.kozlowski@linaro.org +B: mailto:krzk@kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git F: Documentation/devicetree/bindings/memory-controllers/ F: drivers/memory/ @@ -16217,7 +16224,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git F: drivers/devfreq/tegra30-devfreq.c MEMORY HOT(UN)PLUG -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> M: Oscar Salvador <osalvador@suse.de> L: linux-mm@kvack.org S: Maintained @@ -16242,7 +16249,7 @@ F: tools/mm/ MEMORY MANAGEMENT - CORE M: Andrew Morton <akpm@linux-foundation.org> -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> R: Liam R. Howlett <Liam.Howlett@oracle.com> R: Vlastimil Babka <vbabka@suse.cz> @@ -16298,7 +16305,7 @@ F: mm/execmem.c MEMORY MANAGEMENT - GUP (GET USER PAGES) M: Andrew Morton <akpm@linux-foundation.org> -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> R: Jason Gunthorpe <jgg@nvidia.com> R: John Hubbard <jhubbard@nvidia.com> R: Peter Xu <peterx@redhat.com> @@ -16314,7 +16321,7 @@ F: tools/testing/selftests/mm/gup_test.c MEMORY MANAGEMENT - KSM (Kernel Samepage Merging) M: Andrew Morton <akpm@linux-foundation.org> -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> R: Xu Xin <xu.xin16@zte.com.cn> R: Chengming Zhou <chengming.zhou@linux.dev> L: linux-mm@kvack.org @@ -16330,7 +16337,7 @@ F: mm/mm_slot.h MEMORY MANAGEMENT - MEMORY POLICY AND MIGRATION M: Andrew Morton <akpm@linux-foundation.org> -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> R: Zi Yan <ziy@nvidia.com> R: Matthew Brost <matthew.brost@intel.com> R: Joshua Hahn <joshua.hahnjy@gmail.com> @@ -16370,7 +16377,7 @@ F: mm/workingset.c MEMORY MANAGEMENT - MISC M: Andrew Morton <akpm@linux-foundation.org> -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> R: Liam R. Howlett <Liam.Howlett@oracle.com> R: Vlastimil Babka <vbabka@suse.cz> @@ -16458,7 +16465,7 @@ F: mm/shuffle.h MEMORY MANAGEMENT - RECLAIM M: Andrew Morton <akpm@linux-foundation.org> M: Johannes Weiner <hannes@cmpxchg.org> -R: David Hildenbrand <david@redhat.com> +R: David Hildenbrand <david@kernel.org> R: Michal Hocko <mhocko@kernel.org> R: Qi Zheng <zhengqi.arch@bytedance.com> R: Shakeel Butt <shakeel.butt@linux.dev> @@ -16471,7 +16478,7 @@ F: mm/workingset.c MEMORY MANAGEMENT - RMAP (REVERSE MAPPING) M: Andrew Morton <akpm@linux-foundation.org> -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> R: Rik van Riel <riel@surriel.com> R: Liam R. Howlett <Liam.Howlett@oracle.com> @@ -16495,12 +16502,12 @@ F: mm/secretmem.c MEMORY MANAGEMENT - SWAP M: Andrew Morton <akpm@linux-foundation.org> +M: Chris Li <chrisl@kernel.org> +M: Kairui Song <kasong@tencent.com> R: Kemeng Shi <shikemeng@huaweicloud.com> -R: Kairui Song <kasong@tencent.com> R: Nhat Pham <nphamcs@gmail.com> R: Baoquan He <bhe@redhat.com> R: Barry Song <baohua@kernel.org> -R: Chris Li <chrisl@kernel.org> L: linux-mm@kvack.org S: Maintained F: Documentation/mm/swap-table.rst @@ -16516,7 +16523,7 @@ F: mm/swapfile.c MEMORY MANAGEMENT - THP (TRANSPARENT HUGE PAGE) M: Andrew Morton <akpm@linux-foundation.org> -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> R: Zi Yan <ziy@nvidia.com> R: Baolin Wang <baolin.wang@linux.alibaba.com> @@ -16618,7 +16625,7 @@ MEMORY MAPPING - MADVISE (MEMORY ADVICE) M: Andrew Morton <akpm@linux-foundation.org> M: Liam R. Howlett <Liam.Howlett@oracle.com> M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> R: Vlastimil Babka <vbabka@suse.cz> R: Jann Horn <jannh@google.com> L: linux-mm@kvack.org @@ -18776,6 +18783,10 @@ S: Maintained F: arch/arm/*omap*/*clock* OMAP DEVICE TREE SUPPORT +M: Aaro Koskinen <aaro.koskinen@iki.fi> +M: Andreas Kemnade <andreas@kemnade.info> +M: Kevin Hilman <khilman@baylibre.com> +M: Roger Quadros <rogerq@kernel.org> M: Tony Lindgren <tony@atomide.com> L: linux-omap@vger.kernel.org L: devicetree@vger.kernel.org @@ -19895,7 +19906,7 @@ F: drivers/pci/p2pdma.c F: include/linux/pci-p2pdma.h PCI POWER CONTROL -M: Bartosz Golaszewski <brgl@bgdev.pl> +M: Bartosz Golaszewski <brgl@kernel.org> L: linux-pci@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git @@ -20160,6 +20171,7 @@ R: Alexander Shishkin <alexander.shishkin@linux.intel.com> R: Jiri Olsa <jolsa@kernel.org> R: Ian Rogers <irogers@google.com> R: Adrian Hunter <adrian.hunter@intel.com> +R: James Clark <james.clark@linaro.org> L: linux-perf-users@vger.kernel.org L: linux-kernel@vger.kernel.org S: Supported @@ -20491,7 +20503,7 @@ F: include/linux/powercap.h F: kernel/configs/nopm.config POWER SEQUENCING -M: Bartosz Golaszewski <brgl@bgdev.pl> +M: Bartosz Golaszewski <brgl@kernel.org> L: linux-pm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git @@ -21173,7 +21185,7 @@ F: Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml F: drivers/i2c/busses/i2c-qcom-cci.c QUALCOMM INTERCONNECT BWMON DRIVER -M: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org> +M: Krzysztof Kozlowski <krzk@kernel.org> L: linux-arm-msm@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/interconnect/qcom,msm8998-bwmon.yaml @@ -21294,7 +21306,7 @@ F: Documentation/tee/qtee.rst F: drivers/tee/qcomtee/ QUALCOMM TRUST ZONE MEMORY ALLOCATOR -M: Bartosz Golaszewski <bartosz.golaszewski@linaro.org> +M: Bartosz Golaszewski <brgl@kernel.org> L: linux-arm-msm@vger.kernel.org S: Maintained F: drivers/firmware/qcom/qcom_tzmem.c @@ -21669,6 +21681,11 @@ S: Maintained F: Documentation/devicetree/bindings/spi/realtek,rtl9301-snand.yaml F: drivers/spi/spi-realtek-rtl-snand.c +REALTEK SYSTIMER DRIVER +M: Hao-Wen Ting <haowen.ting@realtek.com> +S: Maintained +F: drivers/clocksource/timer-realtek.c + REALTEK WIRELESS DRIVER (rtlwifi family) M: Ping-Ke Shih <pkshih@realtek.com> L: linux-wireless@vger.kernel.org @@ -22645,7 +22662,7 @@ F: arch/s390/mm S390 NETWORK DRIVERS M: Alexandra Winter <wintera@linux.ibm.com> -R: Aswin Karuvally <aswin@linux.ibm.com> +M: Aswin Karuvally <aswin@linux.ibm.com> L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported @@ -25662,7 +25679,7 @@ F: Documentation/devicetree/bindings/crypto/ti,am62l-dthev2.yaml F: drivers/crypto/ti/ TI DAVINCI MACHINE SUPPORT -M: Bartosz Golaszewski <brgl@bgdev.pl> +M: Bartosz Golaszewski <brgl@kernel.org> L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git @@ -26046,6 +26063,8 @@ S: Supported W: https://www.tq-group.com/en/products/tq-embedded/ F: arch/arm/boot/dts/nxp/imx/*mba*.dts* F: arch/arm/boot/dts/nxp/imx/*tqma*.dts* +F: arch/arm/boot/dts/ti/omap/*mba*.dts* +F: arch/arm/boot/dts/ti/omap/*tqma*.dts* F: arch/arm64/boot/dts/freescale/fsl-*tqml*.dts* F: arch/arm64/boot/dts/freescale/imx*mba*.dts* F: arch/arm64/boot/dts/freescale/imx*tqma*.dts* @@ -27084,7 +27103,7 @@ F: net/vmw_vsock/virtio_transport_common.c VIRTIO BALLOON M: "Michael S. Tsirkin" <mst@redhat.com> -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> L: virtualization@lists.linux.dev S: Maintained F: drivers/virtio/virtio_balloon.c @@ -27112,7 +27131,7 @@ S: Maintained F: drivers/char/virtio_console.c F: include/uapi/linux/virtio_console.h -VIRTIO CORE AND NET DRIVERS +VIRTIO CORE M: "Michael S. Tsirkin" <mst@redhat.com> M: Jason Wang <jasowang@redhat.com> R: Xuan Zhuo <xuanzhuo@linux.alibaba.com> @@ -27125,7 +27144,6 @@ F: Documentation/devicetree/bindings/virtio/ F: Documentation/driver-api/virtio/ F: drivers/block/virtio_blk.c F: drivers/crypto/virtio/ -F: drivers/net/virtio_net.c F: drivers/vdpa/ F: drivers/virtio/ F: include/linux/vdpa.h @@ -27134,7 +27152,6 @@ F: include/linux/vringh.h F: include/uapi/linux/virtio_*.h F: net/vmw_vsock/virtio* F: tools/virtio/ -F: tools/testing/selftests/drivers/net/virtio_net/ VIRTIO CRYPTO DRIVER M: Gonglei <arei.gonglei@huawei.com> @@ -27156,6 +27173,7 @@ F: arch/s390/include/uapi/asm/virtio-ccw.h F: drivers/s390/virtio/ VIRTIO FILE SYSTEM +M: German Maglione <gmaglione@redhat.com> M: Vivek Goyal <vgoyal@redhat.com> M: Stefan Hajnoczi <stefanha@redhat.com> M: Miklos Szeredi <miklos@szeredi.hu> @@ -27239,13 +27257,26 @@ F: drivers/iommu/virtio-iommu.c F: include/uapi/linux/virtio_iommu.h VIRTIO MEM DRIVER -M: David Hildenbrand <david@redhat.com> +M: David Hildenbrand <david@kernel.org> L: virtualization@lists.linux.dev S: Maintained W: https://virtio-mem.gitlab.io/ F: drivers/virtio/virtio_mem.c F: include/uapi/linux/virtio_mem.h +VIRTIO NET DRIVER +M: "Michael S. Tsirkin" <mst@redhat.com> +M: Jason Wang <jasowang@redhat.com> +R: Xuan Zhuo <xuanzhuo@linux.alibaba.com> +R: Eugenio Pérez <eperezma@redhat.com> +L: netdev@vger.kernel.org +L: virtualization@lists.linux.dev +S: Maintained +F: drivers/net/virtio_net.c +F: include/linux/virtio_net.h +F: include/uapi/linux/virtio_net.h +F: tools/testing/selftests/drivers/net/virtio_net/ + VIRTIO PMEM DRIVER M: Pankaj Gupta <pankaj.gupta.linux@gmail.com> L: virtualization@lists.linux.dev @@ -27845,7 +27876,7 @@ F: arch/x86/kernel/stacktrace.c F: arch/x86/kernel/unwind_*.c X86 TRUST DOMAIN EXTENSIONS (TDX) -M: Kirill A. Shutemov <kas@kernel.org> +M: Kiryl Shutsemau <kas@kernel.org> R: Dave Hansen <dave.hansen@linux.intel.com> R: Rick Edgecombe <rick.p.edgecombe@intel.com> L: x86@kernel.org @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 18 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = NAME = Baby Opossum Posse # *DOCUMENTATION* @@ -1061,6 +1061,9 @@ NOSTDINC_FLAGS += -nostdinc # perform bounds checking. KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3) +# Allow including a tagged struct or union anonymously in another struct/union. +KBUILD_CFLAGS += -fms-extensions + # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += -fno-strict-overflow diff --git a/arch/Kconfig b/arch/Kconfig index 74ff01133532..61130b88964b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -917,6 +917,13 @@ config ARCH_USES_CFI_TRAPS An architecture should select this option if it requires the .kcfi_traps section for KCFI trap handling. +config ARCH_USES_CFI_GENERIC_LLVM_PASS + bool + help + An architecture should select this option if it uses the generic + KCFIPass in LLVM to expand kCFI bundles instead of architecture-specific + lowering. + config CFI bool "Use Kernel Control Flow Integrity (kCFI)" default CFI_CLANG diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 16dca28ebf17..3fed97478058 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -509,3 +509,4 @@ 577 common open_tree_attr sys_open_tree_attr 578 common file_getattr sys_file_getattr 579 common file_setattr sys_file_setattr +580 common listns sys_listns diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2e3f93b690f4..4fb985b76e97 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -44,6 +44,8 @@ config ARM select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST + # https://github.com/llvm/llvm-project/commit/d130f402642fba3d065aacb506cb061c899558de + select ARCH_USES_CFI_GENERIC_LLVM_PASS if CLANG_VERSION < 220000 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts index aa9576d8ab56..48ca25f57ef6 100644 --- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts +++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts @@ -1254,3 +1254,17 @@ max-frequency = <25000000>; bus-width = <4>; }; + +/* + * FIXME: rgmii delay is introduced by MAC (configured in u-boot now) + * instead of PCB on fuji board, so the "phy-mode" should be updated to + * "rgmii-[tx|rx]id" when the aspeed-mac driver can handle the delay + * properly. + */ +&mac3 { + status = "okay"; + phy-mode = "rgmii"; + phy-handle = <ðphy3>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_rgmii4_default>; +}; diff --git a/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts b/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts index ac44c745bdf8..a39a021a3910 100644 --- a/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts +++ b/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts @@ -55,8 +55,8 @@ mdio { /delete-node/ switch@1e; - bcm54210e: ethernet-phy@0 { - reg = <0>; + bcm54210e: ethernet-phy@25 { + reg = <25>; }; }; }; diff --git a/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts b/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts index 06545a6052f7..43ff5eafb2bb 100644 --- a/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts +++ b/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts @@ -259,7 +259,7 @@ pinctrl-0 = <&pinctrl_audmux>; status = "okay"; - ssi2 { + mux-ssi2 { fsl,audmux-port = <1>; fsl,port-config = < (IMX_AUDMUX_V2_PTCR_SYN | @@ -271,7 +271,7 @@ >; }; - aud3 { + mux-aud3 { fsl,audmux-port = <2>; fsl,port-config = < IMX_AUDMUX_V2_PTCR_SYN diff --git a/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi b/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi index 6de224dd2bb9..6eb80f867f50 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi @@ -339,7 +339,7 @@ #sound-dai-cells = <0>; compatible = "fsl,imx6ul-sai", "fsl,imx6sx-sai"; reg = <0x02030000 0x4000>; - interrupts = <GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clks IMX6UL_CLK_SAI3_IPG>, <&clks IMX6UL_CLK_SAI3>, <&clks IMX6UL_CLK_DUMMY>, <&clks IMX6UL_CLK_DUMMY>; diff --git a/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts b/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts index 107b00b9a939..540642e99a41 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts +++ b/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts @@ -136,7 +136,7 @@ interrupt-parent = <&gpio2>; interrupts = <8 IRQ_TYPE_EDGE_FALLING>; reset-gpios = <&gpio2 14 GPIO_ACTIVE_LOW>; - report-rate-hz = <6>; + report-rate-hz = <60>; /* settings valid only for Hycon touchscreen */ touchscreen-size-x = <1280>; touchscreen-size-y = <800>; diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index f90be312418e..d6ae80b5df36 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -283,10 +283,17 @@ extern int __put_user_8(void *, unsigned long long); __gu_err; \ }) +/* + * This is a type: either unsigned long, if the argument fits into + * that type, or otherwise unsigned long long. + */ +#define __long_type(x) \ + __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) + #define __get_user_err(x, ptr, err, __t) \ do { \ unsigned long __gu_addr = (unsigned long)(ptr); \ - unsigned long __gu_val; \ + __long_type(x) __gu_val; \ unsigned int __ua_flags; \ __chk_user_ptr(ptr); \ might_fault(); \ @@ -295,6 +302,7 @@ do { \ case 1: __get_user_asm_byte(__gu_val, __gu_addr, err, __t); break; \ case 2: __get_user_asm_half(__gu_val, __gu_addr, err, __t); break; \ case 4: __get_user_asm_word(__gu_val, __gu_addr, err, __t); break; \ + case 8: __get_user_asm_dword(__gu_val, __gu_addr, err, __t); break; \ default: (__gu_val) = __get_user_bad(); \ } \ uaccess_restore(__ua_flags); \ @@ -353,6 +361,22 @@ do { \ #define __get_user_asm_word(x, addr, err, __t) \ __get_user_asm(x, addr, err, "ldr" __t) +#ifdef __ARMEB__ +#define __WORD0_OFFS 4 +#define __WORD1_OFFS 0 +#else +#define __WORD0_OFFS 0 +#define __WORD1_OFFS 4 +#endif + +#define __get_user_asm_dword(x, addr, err, __t) \ + ({ \ + unsigned long __w0, __w1; \ + __get_user_asm(__w0, addr + __WORD0_OFFS, err, "ldr" __t); \ + __get_user_asm(__w1, addr + __WORD1_OFFS, err, "ldr" __t); \ + (x) = ((u64)__w1 << 32) | (u64) __w0; \ +}) + #define __put_user_switch(x, ptr, __err, __fn) \ do { \ const __typeof__(*(ptr)) __user *__pu_ptr = (ptr); \ diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index b07e699aaa3c..fd09afae72a2 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -484,3 +484,4 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts b/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts index b8f256545022..3e0319fdb93f 100644 --- a/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts +++ b/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts @@ -18,11 +18,21 @@ #include "bcm2712-rpi-5-b-ovl-rp1.dts" +/ { + aliases { + ethernet0 = &rp1_eth; + }; +}; + &pcie2 { #include "rp1-nexus.dtsi" }; &rp1_eth { + assigned-clocks = <&rp1_clocks RP1_CLK_ETH_TSU>, + <&rp1_clocks RP1_CLK_ETH>; + assigned-clock-rates = <50000000>, + <125000000>; status = "okay"; phy-mode = "rgmii-id"; phy-handle = <&phy1>; diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi index 2cf0f7208350..a72b2f1c4a1b 100644 --- a/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi @@ -67,7 +67,6 @@ img_subsys: bus@58000000 { power-domains = <&pd IMX_SC_R_CSI_0>; fsl,channel = <0>; fsl,num-irqs = <32>; - status = "disabled"; }; gpio0_mipi_csi0: gpio@58222000 { @@ -144,7 +143,6 @@ img_subsys: bus@58000000 { power-domains = <&pd IMX_SC_R_CSI_1>; fsl,channel = <0>; fsl,num-irqs = <32>; - status = "disabled"; }; gpio0_mipi_csi1: gpio@58242000 { diff --git a/arch/arm64/boot/dts/freescale/imx8dxl-ss-conn.dtsi b/arch/arm64/boot/dts/freescale/imx8dxl-ss-conn.dtsi index a66ba6d0a8c0..da33a35c6d46 100644 --- a/arch/arm64/boot/dts/freescale/imx8dxl-ss-conn.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8dxl-ss-conn.dtsi @@ -29,8 +29,8 @@ compatible = "nxp,imx8dxl-dwmac-eqos", "snps,dwmac-5.10a"; reg = <0x5b050000 0x10000>; interrupt-parent = <&gic>; - interrupts = <GIC_SPI 163 IRQ_TYPE_LEVEL_HIGH>, - <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>; + interrupts = <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 163 IRQ_TYPE_LEVEL_HIGH>; interrupt-names = "macirq", "eth_wake_irq"; clocks = <&eqos_lpcg IMX_LPCG_CLK_4>, <&eqos_lpcg IMX_LPCG_CLK_6>, diff --git a/arch/arm64/boot/dts/freescale/imx8dxl-ss-hsio.dtsi b/arch/arm64/boot/dts/freescale/imx8dxl-ss-hsio.dtsi index ec466e4d7df5..5c0d09c5c086 100644 --- a/arch/arm64/boot/dts/freescale/imx8dxl-ss-hsio.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8dxl-ss-hsio.dtsi @@ -54,3 +54,8 @@ interrupt-names = "dma"; }; }; + +&pcieb_ep { + interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "dma"; +}; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts b/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts index 614b4ce330b1..0924ac50fd2d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts @@ -16,11 +16,20 @@ ethernet1 = &eqos; }; - extcon_usbc: usbc { - compatible = "linux,extcon-usb-gpio"; + connector { + compatible = "gpio-usb-b-connector", "usb-b-connector"; + id-gpios = <&gpio1 10 GPIO_ACTIVE_HIGH>; + label = "Type-C"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_usb1_id>; - id-gpios = <&gpio1 10 GPIO_ACTIVE_HIGH>; + type = "micro"; + vbus-supply = <®_usb1_vbus>; + + port { + usb_dr_connector: endpoint { + remote-endpoint = <&usb3_dwc>; + }; + }; }; leds { @@ -244,9 +253,15 @@ hnp-disable; srp-disable; dr_mode = "otg"; - extcon = <&extcon_usbc>; usb-role-switch; + role-switch-default-mode = "peripheral"; status = "okay"; + + port { + usb3_dwc: endpoint { + remote-endpoint = <&usb_dr_connector>; + }; + }; }; &usb_dwc3_1 { @@ -273,7 +288,6 @@ }; &usb3_phy0 { - vbus-supply = <®_usb1_vbus>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/freescale/imx8qm-mek.dts b/arch/arm64/boot/dts/freescale/imx8qm-mek.dts index 202d5c67ac40..9c0b6b8d6459 100644 --- a/arch/arm64/boot/dts/freescale/imx8qm-mek.dts +++ b/arch/arm64/boot/dts/freescale/imx8qm-mek.dts @@ -217,8 +217,8 @@ compatible = "nxp,cbdtu02043", "gpio-sbu-mux"; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_typec_mux>; - select-gpios = <&lsio_gpio4 6 GPIO_ACTIVE_LOW>; - enable-gpios = <&lsio_gpio4 19 GPIO_ACTIVE_HIGH>; + select-gpios = <&lsio_gpio4 6 GPIO_ACTIVE_HIGH>; + enable-gpios = <&lsio_gpio4 19 GPIO_ACTIVE_LOW>; orientation-switch; port { diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi index 1292677cbe4e..6da961eb3fe5 100644 --- a/arch/arm64/boot/dts/freescale/imx95.dtsi +++ b/arch/arm64/boot/dts/freescale/imx95.dtsi @@ -1886,7 +1886,7 @@ assigned-clock-rates = <3600000000>, <100000000>, <10000000>; assigned-clock-parents = <0>, <0>, <&scmi_clk IMX95_CLK_SYSPLL1_PFD1_DIV2>; - msi-map = <0x0 &its 0x98 0x1>; + msi-map = <0x0 &its 0x10 0x1>; power-domains = <&scmi_devpd IMX95_PD_HSIO_TOP>; status = "disabled"; }; @@ -1963,6 +1963,7 @@ assigned-clock-rates = <3600000000>, <100000000>, <10000000>; assigned-clock-parents = <0>, <0>, <&scmi_clk IMX95_CLK_SYSPLL1_PFD1_DIV2>; + msi-map = <0x0 &its 0x98 0x1>; power-domains = <&scmi_devpd IMX95_PD_HSIO_TOP>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi index a410fc335fa3..c0f17f8189fa 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi @@ -42,6 +42,7 @@ interrupt-parent = <&gpio>; interrupts = <TEGRA194_MAIN_GPIO(G, 4) IRQ_TYPE_LEVEL_LOW>; #phy-cells = <0>; + wakeup-source; }; }; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi index 283d9cbc4368..03b7c4313750 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi @@ -598,7 +598,6 @@ pinctrl-2 = <&otp_pin>; resets = <&cru SRST_TSADC>; reset-names = "tsadc-apb"; - rockchip,grf = <&grf>; rockchip,hw-tshut-temp = <100000>; #thermal-sensor-cells = <1>; status = "disabled"; diff --git a/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi index c4f4f1ff6117..9da6fd82e46b 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi @@ -3,7 +3,7 @@ * Copyright (c) 2016-2017 Fuzhou Rockchip Electronics Co., Ltd */ -#include "rk3399.dtsi" +#include "rk3399-base.dtsi" / { cluster0_opp: opp-table-0 { diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso index 5e8f729c2cf2..141a921a06e4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso +++ b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso @@ -45,11 +45,11 @@ cam_dovdd_1v8: regulator-cam-dovdd-1v8 { compatible = "regulator-fixed"; - gpio = <&pca9670 3 GPIO_ACTIVE_LOW>; - regulator-max-microvolt = <1800000>; - regulator-min-microvolt = <1800000>; - regulator-name = "cam-dovdd-1v8"; - vin-supply = <&vcc1v8_video>; + gpio = <&pca9670 3 GPIO_ACTIVE_LOW>; + regulator-max-microvolt = <1800000>; + regulator-min-microvolt = <1800000>; + regulator-name = "cam-dovdd-1v8"; + vin-supply = <&vcc1v8_video>; }; cam_dvdd_1v2: regulator-cam-dvdd-1v2 { diff --git a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi index 7f578c50b4ad..b6cf03a7ba66 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi @@ -120,7 +120,7 @@ compatible = "regulator-fixed"; regulator-name = "vcc3v3_pcie"; enable-active-high; - gpios = <&gpio0 RK_PB1 GPIO_ACTIVE_HIGH>; + gpios = <&gpio4 RK_PB1 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&pcie_drv>; regulator-always-on; @@ -187,7 +187,7 @@ vcc5v0_usb2b: regulator-vcc5v0-usb2b { compatible = "regulator-fixed"; enable-active-high; - gpio = <&gpio0 RK_PC4 GPIO_ACTIVE_HIGH>; + gpio = <&gpio4 RK_PC4 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&vcc5v0_usb2b_en>; regulator-name = "vcc5v0_usb2b"; @@ -199,7 +199,7 @@ vcc5v0_usb2t: regulator-vcc5v0-usb2t { compatible = "regulator-fixed"; enable-active-high; - gpios = <&gpio0 RK_PD5 GPIO_ACTIVE_HIGH>; + gpios = <&gpio3 RK_PD5 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&vcc5v0_usb2t_en>; regulator-name = "vcc5v0_usb2t"; diff --git a/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi index d0e38412d56a..08bf40de17ea 100644 --- a/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi @@ -789,7 +789,7 @@ vccio1-supply = <&vccio_acodec>; vccio2-supply = <&vcc_1v8>; vccio3-supply = <&vccio_sd>; - vccio4-supply = <&vcc_1v8>; + vccio4-supply = <&vcca1v8_pmu>; vccio5-supply = <&vcc_1v8>; vccio6-supply = <&vcc1v8_dvp>; vccio7-supply = <&vcc_3v3>; diff --git a/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts b/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts index 0f844806ec54..442a2bc43ba8 100644 --- a/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts +++ b/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts @@ -482,6 +482,8 @@ }; &i2s1_8ch { + pinctrl-names = "default"; + pinctrl-0 = <&i2s1m0_sclktx &i2s1m0_lrcktx &i2s1m0_sdi0 &i2s1m0_sdo0>; rockchip,trcm-sync-tx-only; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3576.dtsi b/arch/arm64/boot/dts/rockchip/rk3576.dtsi index fc4e9e07f1cf..a86fc6b4e8c4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3576.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3576.dtsi @@ -276,12 +276,6 @@ opp-microvolt = <900000 900000 950000>; clock-latency-ns = <40000>; }; - - opp-2208000000 { - opp-hz = /bits/ 64 <2208000000>; - opp-microvolt = <950000 950000 950000>; - clock-latency-ns = <40000>; - }; }; cluster1_opp_table: opp-table-cluster1 { @@ -348,12 +342,6 @@ opp-microvolt = <925000 925000 950000>; clock-latency-ns = <40000>; }; - - opp-2304000000 { - opp-hz = /bits/ 64 <2304000000>; - opp-microvolt = <950000 950000 950000>; - clock-latency-ns = <40000>; - }; }; gpu_opp_table: opp-table-gpu { @@ -2561,8 +2549,6 @@ interrupts = <GIC_SPI 97 IRQ_TYPE_LEVEL_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&i2c9m0_xfer>; - resets = <&cru SRST_I2C9>, <&cru SRST_P_I2C9>; - reset-names = "i2c", "apb"; #address-cells = <1>; #size-cells = <0>; status = "disabled"; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi index 0f1a77697351..b5d630d2c879 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi @@ -115,7 +115,7 @@ }; }; - gpu_opp_table: opp-table { + gpu_opp_table: opp-table-gpu { compatible = "operating-points-v2"; opp-300000000 { diff --git a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi index b44e89e1bb15..365c1d958f2d 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi @@ -382,14 +382,12 @@ cap-mmc-highspeed; mmc-ddr-1_8v; mmc-hs200-1_8v; - mmc-hs400-1_8v; - mmc-hs400-enhanced-strobe; mmc-pwrseq = <&emmc_pwrseq>; no-sdio; no-sd; non-removable; pinctrl-names = "default"; - pinctrl-0 = <&emmc_bus8 &emmc_cmd &emmc_clk &emmc_data_strobe>; + pinctrl-0 = <&emmc_bus8 &emmc_cmd &emmc_clk>; vmmc-supply = <&vcc_3v3_s3>; vqmmc-supply = <&vcc_1v8_s3>; status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi index 9884a5df47df..e1e0e3fc0ca7 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi @@ -66,7 +66,7 @@ }; }; - gpu_opp_table: opp-table { + gpu_opp_table: opp-table-gpu { compatible = "operating-points-v2"; opp-300000000 { diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts index ad6d04793b0a..83b9b6645a1e 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts @@ -14,8 +14,8 @@ gpios = <&gpio0 RK_PC5 GPIO_ACTIVE_HIGH>; regulator-name = "vcc3v3_pcie20"; regulator-boot-on; - regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <1800000>; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; startup-delay-us = <50000>; vin-supply = <&vcc5v0_sys>; }; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index e3a2d37bd104..1a48faad2473 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1341,7 +1341,7 @@ CONFIG_COMMON_CLK_RS9_PCIE=y CONFIG_COMMON_CLK_VC3=y CONFIG_COMMON_CLK_VC5=y CONFIG_COMMON_CLK_BD718XX=m -CONFIG_CLK_RASPBERRYPI=m +CONFIG_CLK_RASPBERRYPI=y CONFIG_CLK_IMX8MM=y CONFIG_CLK_IMX8MN=y CONFIG_CLK_IMX8MP=y diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 00d97b8a757f..51746005239b 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -26,9 +26,12 @@ void __init apply_alternatives_all(void); bool alternative_is_applied(u16 cpucap); #ifdef CONFIG_MODULES -void apply_alternatives_module(void *start, size_t length); +int apply_alternatives_module(void *start, size_t length); #else -static inline void apply_alternatives_module(void *start, size_t length) { } +static inline int apply_alternatives_module(void *start, size_t length) +{ + return 0; +} #endif void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr, diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 28be048db3f6..bceeaec21fb9 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -19,7 +19,7 @@ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) +#define __WARN_FLAGS(cond_str, flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) #define HAVE_ARCH_BUG diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h index a81937fae9f6..21dbc9dda747 100644 --- a/arch/arm64/include/asm/kfence.h +++ b/arch/arm64/include/asm/kfence.h @@ -10,8 +10,6 @@ #include <asm/set_memory.h> -static inline bool arch_kfence_init_pool(void) { return true; } - static inline bool kfence_protect_page(unsigned long addr, bool protect) { set_memory_valid(addr, 1, !protect); @@ -25,6 +23,7 @@ static inline bool arm64_kfence_can_set_direct_map(void) { return !kfence_early_init; } +bool arch_kfence_init_pool(void); #else /* CONFIG_KFENCE */ static inline bool arm64_kfence_can_set_direct_map(void) { return false; } #endif /* CONFIG_KFENCE */ diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 2312e6ee595f..258cca4b4873 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -33,8 +33,8 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr); #define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio -void tag_clear_highpage(struct page *to); -#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE +bool tag_clear_highpages(struct page *to, int numpages); +#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 9abcc8ef3087..b57b2bb00967 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -77,7 +77,7 @@ __percpu_##name##_case_##sz(void *ptr, unsigned long val) \ " stxr" #sfx "\t%w[loop], %" #w "[tmp], %[ptr]\n" \ " cbnz %w[loop], 1b", \ /* LSE atomics */ \ - #op_lse "\t%" #w "[val], %[ptr]\n" \ + #op_lse "\t%" #w "[val], %" #w "[tmp], %[ptr]\n" \ __nops(3)) \ : [loop] "=&r" (loop), [tmp] "=&r" (tmp), \ [ptr] "+Q"(*(u##sz *)ptr) \ @@ -124,9 +124,16 @@ PERCPU_RW_OPS(8) PERCPU_RW_OPS(16) PERCPU_RW_OPS(32) PERCPU_RW_OPS(64) -PERCPU_OP(add, add, stadd) -PERCPU_OP(andnot, bic, stclr) -PERCPU_OP(or, orr, stset) + +/* + * Use value-returning atomics for CPU-local ops as they are more likely + * to execute "near" to the CPU (e.g. in L1$). + * + * https://lore.kernel.org/r/e7d539ed-ced0-4b96-8ecd-048a5b803b85@paulmck-laptop + */ +PERCPU_OP(add, add, ldadd) +PERCPU_OP(andnot, bic, ldclr) +PERCPU_OP(or, orr, ldset) PERCPU_RET_OP(add, add, ldadd) #undef PERCPU_RW_OPS diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index a76f9b387a26..c59f6324f2bb 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -53,7 +53,7 @@ enum { EDYNSCS_INVALID_CFA_OPCODE = 4, }; -int __pi_scs_patch(const u8 eh_frame[], int size); +int __pi_scs_patch(const u8 eh_frame[], int size, bool skip_dry_run); #endif /* __ASSEMBLY __ */ diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index 8fef12626090..900454aaa292 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -117,6 +117,7 @@ void spectre_bhb_patch_wa3(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); void spectre_bhb_patch_clearbhb(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); +void spectre_print_disabled_mitigations(void); #endif /* __ASSEMBLY__ */ #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 1aa4ecb73429..6490930deef8 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -422,9 +422,9 @@ static __must_check __always_inline bool user_access_begin(const void __user *pt } #define user_access_begin(a,b) user_access_begin(a,b) #define user_access_end() uaccess_ttbr0_disable() -#define unsafe_put_user(x, ptr, label) \ +#define arch_unsafe_put_user(x, ptr, label) \ __raw_put_mem("sttr", x, uaccess_mask_ptr(ptr), label, U) -#define unsafe_get_user(x, ptr, label) \ +#define arch_unsafe_get_user(x, ptr, label) \ __raw_get_mem("ldtr", x, uaccess_mask_ptr(ptr), label, U) /* diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 7aca29e1d30b..f1cb2447afc9 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -197,8 +197,6 @@ out: */ void __init acpi_boot_table_init(void) { - int ret; - /* * Enable ACPI instead of device tree unless * - ACPI has been disabled explicitly (acpi=off), or @@ -252,12 +250,8 @@ done: * behaviour, use acpi=nospcr to disable console in ACPI SPCR * table as default serial console. */ - ret = acpi_parse_spcr(earlycon_acpi_spcr_enable, + acpi_parse_spcr(earlycon_acpi_spcr_enable, !param_acpi_nospcr); - if (!ret || param_acpi_nospcr || !IS_ENABLED(CONFIG_ACPI_SPCR_TABLE)) - pr_info("Use ACPI SPCR as default console: No\n"); - else - pr_info("Use ACPI SPCR as default console: Yes\n"); if (IS_ENABLED(CONFIG_ACPI_BGRT)) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); @@ -357,16 +351,6 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) * as long as we take care not to create a writable * mapping for executable code. */ - fallthrough; - - case EFI_ACPI_MEMORY_NVS: - /* - * ACPI NVS marks an area reserved for use by the - * firmware, even after exiting the boot service. - * This may be used by the firmware for sharing dynamic - * tables/data (e.g., ACPI CCEL) with the OS. Map it - * as read-only. - */ prot = PAGE_KERNEL_RO; break; diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 8ff6610af496..f5ec7e7c1d3f 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -139,9 +139,9 @@ static noinstr void clean_dcache_range_nopatch(u64 start, u64 end) } while (cur += d_size, cur < end); } -static void __apply_alternatives(const struct alt_region *region, - bool is_module, - unsigned long *cpucap_mask) +static int __apply_alternatives(const struct alt_region *region, + bool is_module, + unsigned long *cpucap_mask) { struct alt_instr *alt; __le32 *origptr, *updptr; @@ -166,10 +166,13 @@ static void __apply_alternatives(const struct alt_region *region, updptr = is_module ? origptr : lm_alias(origptr); nr_inst = alt->orig_len / AARCH64_INSN_SIZE; - if (ALT_HAS_CB(alt)) + if (ALT_HAS_CB(alt)) { alt_cb = ALT_REPL_PTR(alt); - else + if (is_module && !core_kernel_text((unsigned long)alt_cb)) + return -ENOEXEC; + } else { alt_cb = patch_alternative; + } alt_cb(alt, origptr, updptr, nr_inst); @@ -193,6 +196,8 @@ static void __apply_alternatives(const struct alt_region *region, bitmap_and(applied_alternatives, applied_alternatives, system_cpucaps, ARM64_NCAPS); } + + return 0; } static void __init apply_alternatives_vdso(void) @@ -277,7 +282,7 @@ void __init apply_boot_alternatives(void) } #ifdef CONFIG_MODULES -void apply_alternatives_module(void *start, size_t length) +int apply_alternatives_module(void *start, size_t length) { struct alt_region region = { .begin = start, @@ -287,7 +292,7 @@ void apply_alternatives_module(void *start, size_t length) bitmap_fill(all_capabilities, ARM64_NCAPS); - __apply_alternatives(®ion, true, &all_capabilities[0]); + return __apply_alternatives(®ion, true, &all_capabilities[0]); } #endif diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5ed401ff79e3..e25b0f84a22d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -95,6 +95,7 @@ #include <asm/vectors.h> #include <asm/virt.h> +#include <asm/spectre.h> /* Kernel representation of AT_HWCAP and AT_HWCAP2 */ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; @@ -3875,6 +3876,11 @@ static void __init setup_system_capabilities(void) */ if (system_uses_ttbr0_pan()) pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); + + /* + * Report Spectre mitigations status. + */ + spectre_print_disabled_mitigations(); } void __init setup_system_features(void) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index a9c81715ce59..0a97e2621f60 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -100,7 +100,7 @@ static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { local_irq_disable(); - exit_to_user_mode_prepare(regs); + exit_to_user_mode_prepare_legacy(regs); local_daif_mask(); mte_check_tfsr_exit(); exit_to_user_mode(); diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index d6d443c4a01a..24adb581af0e 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -489,16 +489,29 @@ int module_finalize(const Elf_Ehdr *hdr, int ret; s = find_section(hdr, sechdrs, ".altinstructions"); - if (s) - apply_alternatives_module((void *)s->sh_addr, s->sh_size); + if (s) { + ret = apply_alternatives_module((void *)s->sh_addr, s->sh_size); + if (ret < 0) { + pr_err("module %s: error occurred when applying alternatives\n", me->name); + return ret; + } + } if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); if (s) { - ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size); - if (ret) + /* + * Because we can reject modules that are malformed + * so SCS patching fails, skip dry run and try to patch + * it in place. If patching fails, the module would not + * be loaded anyway. + */ + ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size, true); + if (ret) { pr_err("module %s: error occurred during dynamic SCS patching (%d)\n", me->name, ret); + return -ENOEXEC; + } } } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 43f7a2f39403..32148bf09c1d 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -476,7 +476,8 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, folio = page_folio(page); if (folio_test_hugetlb(folio)) - WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio)); + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio) && + !is_huge_zero_folio(folio)); else WARN_ON_ONCE(!page_mte_tagged(page) && !is_zero_page(page)); diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index e8ddbde31a83..659297f87cfa 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -104,7 +104,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level) if (enable_scs) { scs_patch(__eh_frame_start + va_offset, - __eh_frame_end - __eh_frame_start); + __eh_frame_end - __eh_frame_start, false); asm("ic ialluis"); dynamic_scs_is_enabled = true; diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index 55d0cd64ef71..bbe7d30ed12b 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -225,7 +225,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, return 0; } -int scs_patch(const u8 eh_frame[], int size) +int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run) { int code_alignment_factor = 1; bool fde_use_sdata8 = false; @@ -277,11 +277,13 @@ int scs_patch(const u8 eh_frame[], int size) } } else { ret = scs_handle_fde_frame(frame, code_alignment_factor, - fde_use_sdata8, true); + fde_use_sdata8, !skip_dry_run); if (ret) return ret; - scs_handle_fde_frame(frame, code_alignment_factor, - fde_use_sdata8, false); + + if (!skip_dry_run) + scs_handle_fde_frame(frame, code_alignment_factor, + fde_use_sdata8, false); } p += sizeof(frame->size) + frame->size; diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index 08ef9f80456b..aec3172d4003 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -27,7 +27,7 @@ extern pgd_t init_pg_dir[], init_pg_end[]; void init_feature_override(u64 boot_status, const void *fdt, int chosen); u64 kaslr_early_init(void *fdt, int chosen); void relocate_kernel(u64 offset); -int scs_patch(const u8 eh_frame[], int size); +int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run); void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 8ab6104a4883..43a0361a8bf0 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -49,7 +49,10 @@ void *alloc_insn_page(void) addr = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!addr) return NULL; - set_memory_rox((unsigned long)addr, 1); + if (set_memory_rox((unsigned long)addr, 1)) { + execmem_free(addr); + return NULL; + } return addr; } diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index f9a32dfde006..80a580e019c5 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -91,12 +91,7 @@ early_param("nospectre_v2", parse_spectre_v2_param); static bool spectre_v2_mitigations_off(void) { - bool ret = __nospectre_v2 || cpu_mitigations_off(); - - if (ret) - pr_info_once("spectre-v2 mitigation disabled by command line option\n"); - - return ret; + return __nospectre_v2 || cpu_mitigations_off(); } static const char *get_bhb_affected_string(enum mitigation_state bhb_state) @@ -421,13 +416,8 @@ early_param("ssbd", parse_spectre_v4_param); */ static bool spectre_v4_mitigations_off(void) { - bool ret = cpu_mitigations_off() || - __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; - - if (ret) - pr_info_once("spectre-v4 mitigation disabled by command-line option\n"); - - return ret; + return cpu_mitigations_off() || + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; } /* Do we need to toggle the mitigation state on entry to/exit from the kernel? */ @@ -1043,9 +1033,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) { /* No point mitigating Spectre-BHB alone. */ } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) { - pr_info_once("spectre-bhb mitigation disabled by compile time option\n"); - } else if (cpu_mitigations_off() || __nospectre_bhb) { - pr_info_once("spectre-bhb mitigation disabled by command line option\n"); + /* Do nothing */ } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) { state = SPECTRE_MITIGATED; set_bit(BHB_HW, &system_bhb_mitigations); @@ -1199,3 +1187,18 @@ void unpriv_ebpf_notify(int new_state) pr_err("WARNING: %s", EBPF_WARN); } #endif + +void spectre_print_disabled_mitigations(void) +{ + /* Keep a single copy of the common message suffix to avoid duplication. */ + const char *spectre_disabled_suffix = "mitigation disabled by command-line option\n"; + + if (spectre_v2_mitigations_off()) + pr_info("spectre-v2 %s", spectre_disabled_suffix); + + if (spectre_v4_mitigations_off()) + pr_info("spectre-v4 %s", spectre_disabled_suffix); + + if (__nospectre_bhb || cpu_mitigations_off()) + pr_info("spectre-bhb %s", spectre_disabled_suffix); +} diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 68cea3a4a35c..6fb838eee2e7 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -1094,7 +1094,7 @@ static void ipi_setup_sgi(int ipi) irq = ipi_irq_base + ipi; if (ipi_should_be_nmi(ipi)) { - err = request_percpu_nmi(irq, ipi_handler, "IPI", &irq_stat); + err = request_percpu_nmi(irq, ipi_handler, "IPI", NULL, &irq_stat); WARN(err, "Could not request IRQ %d as NMI, err=%d\n", irq, err); } else { err = request_percpu_irq(irq, ipi_handler, "IPI", &irq_stat); diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index ffa3536581f6..9d0efed91414 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -63,7 +63,7 @@ VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ $(filter -Werror,$(KBUILD_CPPFLAGS)) \ -Werror-implicit-function-declaration \ -Wno-format-security \ - -std=gnu11 + -std=gnu11 -fms-extensions VDSO_CFLAGS += -O2 # Some useful compiler-dependent flags from top-level Makefile VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign) @@ -71,6 +71,7 @@ VDSO_CFLAGS += -fno-strict-overflow VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) VDSO_CFLAGS += -Werror=date-time VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) +VDSO_CFLAGS += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag) # Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is # unreliable. diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 870953b4a8a7..052bf0d4d0b0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -624,6 +624,7 @@ nommu: kvm_timer_vcpu_load(vcpu); kvm_vgic_load(vcpu); kvm_vcpu_load_debug(vcpu); + kvm_vcpu_load_fgt(vcpu); if (has_vhe()) kvm_vcpu_load_vhe(vcpu); kvm_arch_vcpu_load_fp(vcpu); @@ -642,7 +643,6 @@ nommu: vcpu->arch.hcr_el2 |= HCR_TWI; vcpu_set_pauth_traps(vcpu); - kvm_vcpu_load_fgt(vcpu); if (is_protected_kvm_enabled()) { kvm_call_hyp_nvhe(__pkvm_vcpu_load, diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 4e16f9b96f63..58b7d0c477d7 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -479,7 +479,7 @@ static void __do_ffa_mem_xfer(const u64 func_id, struct ffa_mem_region_attributes *ep_mem_access; struct ffa_composite_mem_region *reg; struct ffa_mem_region *buf; - u32 offset, nr_ranges; + u32 offset, nr_ranges, checked_offset; int ret = 0; if (addr_mbz || npages_mbz || fraglen > len || @@ -516,7 +516,12 @@ static void __do_ffa_mem_xfer(const u64 func_id, goto out_unlock; } - if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) { + if (check_add_overflow(offset, sizeof(struct ffa_composite_mem_region), &checked_offset)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (fraglen < checked_offset) { ret = FFA_RET_INVALID_PARAMETERS; goto out_unlock; } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index ddc8beb55eee..49db32f3ddf7 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -367,6 +367,19 @@ static int host_stage2_unmap_dev_all(void) return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); } +/* + * Ensure the PFN range is contained within PA-range. + * + * This check is also robust to overflows and is therefore a requirement before + * using a pfn/nr_pages pair from an untrusted source. + */ +static bool pfn_range_is_valid(u64 pfn, u64 nr_pages) +{ + u64 limit = BIT(kvm_phys_shift(&host_mmu.arch.mmu) - PAGE_SHIFT); + + return pfn < limit && ((limit - pfn) >= nr_pages); +} + struct kvm_mem_range { u64 start; u64 end; @@ -776,6 +789,9 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) void *virt = __hyp_va(phys); int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); hyp_lock_component(); @@ -804,6 +820,9 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) u64 virt = (u64)__hyp_va(phys); int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); hyp_lock_component(); @@ -887,6 +906,9 @@ int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) u64 size = PAGE_SIZE * nr_pages; int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); if (!ret) @@ -902,6 +924,9 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) u64 size = PAGE_SIZE * nr_pages; int ret; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + host_lock_component(); ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); if (!ret) @@ -945,6 +970,9 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu if (prot & ~KVM_PGTABLE_PROT_RWX) return -EINVAL; + if (!pfn_range_is_valid(pfn, nr_pages)) + return -EINVAL; + ret = __guest_check_transition_size(phys, ipa, nr_pages, &size); if (ret) return ret; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e67eb39ddc11..ec3fbe0b8d52 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2595,19 +2595,23 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu, .val = 0, \ } -/* sys_reg_desc initialiser for known cpufeature ID registers */ -#define AA32_ID_SANITISED(name) { \ - ID_DESC(name), \ - .visibility = aa32_id_visibility, \ - .val = 0, \ -} - /* sys_reg_desc initialiser for writable ID registers */ #define ID_WRITABLE(name, mask) { \ ID_DESC(name), \ .val = mask, \ } +/* + * 32bit ID regs are fully writable when the guest is 32bit + * capable. Nothing in the KVM code should rely on 32bit features + * anyway, only 64bit, so let the VMM do its worse. + */ +#define AA32_ID_WRITABLE(name) { \ + ID_DESC(name), \ + .visibility = aa32_id_visibility, \ + .val = GENMASK(31, 0), \ +} + /* sys_reg_desc initialiser for cpufeature ID registers that need filtering */ #define ID_FILTERED(sysreg, name, mask) { \ ID_DESC(sysreg), \ @@ -3128,40 +3132,39 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ - AA32_ID_SANITISED(ID_PFR0_EL1), - AA32_ID_SANITISED(ID_PFR1_EL1), + AA32_ID_WRITABLE(ID_PFR0_EL1), + AA32_ID_WRITABLE(ID_PFR1_EL1), { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, .get_user = get_id_reg, .set_user = set_id_dfr0_el1, .visibility = aa32_id_visibility, .reset = read_sanitised_id_dfr0_el1, - .val = ID_DFR0_EL1_PerfMon_MASK | - ID_DFR0_EL1_CopDbg_MASK, }, + .val = GENMASK(31, 0) }, ID_HIDDEN(ID_AFR0_EL1), - AA32_ID_SANITISED(ID_MMFR0_EL1), - AA32_ID_SANITISED(ID_MMFR1_EL1), - AA32_ID_SANITISED(ID_MMFR2_EL1), - AA32_ID_SANITISED(ID_MMFR3_EL1), + AA32_ID_WRITABLE(ID_MMFR0_EL1), + AA32_ID_WRITABLE(ID_MMFR1_EL1), + AA32_ID_WRITABLE(ID_MMFR2_EL1), + AA32_ID_WRITABLE(ID_MMFR3_EL1), /* CRm=2 */ - AA32_ID_SANITISED(ID_ISAR0_EL1), - AA32_ID_SANITISED(ID_ISAR1_EL1), - AA32_ID_SANITISED(ID_ISAR2_EL1), - AA32_ID_SANITISED(ID_ISAR3_EL1), - AA32_ID_SANITISED(ID_ISAR4_EL1), - AA32_ID_SANITISED(ID_ISAR5_EL1), - AA32_ID_SANITISED(ID_MMFR4_EL1), - AA32_ID_SANITISED(ID_ISAR6_EL1), + AA32_ID_WRITABLE(ID_ISAR0_EL1), + AA32_ID_WRITABLE(ID_ISAR1_EL1), + AA32_ID_WRITABLE(ID_ISAR2_EL1), + AA32_ID_WRITABLE(ID_ISAR3_EL1), + AA32_ID_WRITABLE(ID_ISAR4_EL1), + AA32_ID_WRITABLE(ID_ISAR5_EL1), + AA32_ID_WRITABLE(ID_MMFR4_EL1), + AA32_ID_WRITABLE(ID_ISAR6_EL1), /* CRm=3 */ - AA32_ID_SANITISED(MVFR0_EL1), - AA32_ID_SANITISED(MVFR1_EL1), - AA32_ID_SANITISED(MVFR2_EL1), + AA32_ID_WRITABLE(MVFR0_EL1), + AA32_ID_WRITABLE(MVFR1_EL1), + AA32_ID_WRITABLE(MVFR2_EL1), ID_UNALLOCATED(3,3), - AA32_ID_SANITISED(ID_PFR2_EL1), + AA32_ID_WRITABLE(ID_PFR2_EL1), ID_HIDDEN(ID_DFR1_EL1), - AA32_ID_SANITISED(ID_MMFR5_EL1), + AA32_ID_WRITABLE(ID_MMFR5_EL1), ID_UNALLOCATED(3,7), /* AArch64 ID registers */ @@ -5606,11 +5609,17 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) guard(mutex)(&kvm->arch.config_lock); - if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) && - irqchip_in_kernel(kvm) && - kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) { - kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK; - kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK; + /* + * This hacks into the ID registers, so only perform it when the + * first vcpu runs, or the kvm_set_vm_id_reg() helper will scream. + */ + if (!irqchip_in_kernel(kvm) && !kvm_vm_has_ran_once(kvm)) { + u64 val; + + val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); + val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val); } if (vcpu_has_nv(vcpu)) { diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index 4c1209261b65..bb92853d1fd3 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -64,29 +64,37 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter) static int iter_mark_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long intid, flags; struct vgic_irq *irq; - unsigned long intid; int nr_lpis = 0; + xa_lock_irqsave(&dist->lpi_xa, flags); + xa_for_each(&dist->lpi_xa, intid, irq) { if (!vgic_try_get_irq_ref(irq)) continue; - xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + __xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); nr_lpis++; } + xa_unlock_irqrestore(&dist->lpi_xa, flags); + return nr_lpis; } static void iter_unmark_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long intid, flags; struct vgic_irq *irq; - unsigned long intid; xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) { - xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + xa_lock_irqsave(&dist->lpi_xa, flags); + __xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER); + xa_unlock_irqrestore(&dist->lpi_xa, flags); + + /* vgic_put_irq() expects to be called outside of the xa_lock */ vgic_put_irq(kvm, irq); } } diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 1796b1a22a72..da62edbc1205 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - xa_init(&dist->lpi_xa); + xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ); } /* CREATION */ @@ -71,6 +71,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); int kvm_vgic_create(struct kvm *kvm, u32 type) { struct kvm_vcpu *vcpu; + u64 aa64pfr0, pfr1; unsigned long i; int ret; @@ -161,10 +162,19 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) + aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; + pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; + + if (type == KVM_DEV_TYPE_ARM_VGIC_V2) { kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - else + } else { INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); + aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); + pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); + } + + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); if (type == KVM_DEV_TYPE_ARM_VGIC_V3) kvm->arch.vgic.nassgicap = system_supports_direct_sgis(); diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index ce3e3ed3f29f..3f1c4b10fed9 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -78,6 +78,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq; + unsigned long flags; int ret; /* In this case there is no put, since we keep the reference. */ @@ -88,7 +89,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, if (!irq) return ERR_PTR(-ENOMEM); - ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); + ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT); if (ret) { kfree(irq); return ERR_PTR(ret); @@ -103,7 +104,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, irq->target_vcpu = vcpu; irq->group = 1; - xa_lock(&dist->lpi_xa); + xa_lock_irqsave(&dist->lpi_xa, flags); /* * There could be a race with another vgic_add_lpi(), so we need to @@ -114,21 +115,18 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, /* Someone was faster with adding this LPI, lets use that. */ kfree(irq); irq = oldirq; - - goto out_unlock; + } else { + ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0)); } - ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0)); + xa_unlock_irqrestore(&dist->lpi_xa, flags); + if (ret) { xa_release(&dist->lpi_xa, intid); kfree(irq); - } - -out_unlock: - xa_unlock(&dist->lpi_xa); - if (ret) return ERR_PTR(ret); + } /* * We "cache" the configuration table entries in our struct vgic_irq's. diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 6fbb4b099855..2f75ef14d339 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -301,7 +301,8 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) return; /* Hide GICv3 sysreg if necessary */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) { + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 || + !irqchip_in_kernel(vcpu->kvm)) { vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | ICH_HCR_EL2_TC); return; diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 6dd5a10081e2..8d20c53faef0 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -28,7 +28,7 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * kvm->arch.config_lock (mutex) * its->cmd_lock (mutex) * its->its_lock (mutex) - * vgic_dist->lpi_xa.xa_lock + * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled * vgic_cpu->ap_list_lock must be taken with IRQs disabled * vgic_irq->irq_lock must be taken with IRQs disabled * @@ -141,32 +141,39 @@ static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { struct vgic_dist *dist = &kvm->arch.vgic; + unsigned long flags; - if (irq->intid >= VGIC_MIN_LPI) - might_lock(&dist->lpi_xa.xa_lock); + /* + * Normally the lock is only taken when the refcount drops to 0. + * Acquire/release it early on lockdep kernels to make locking issues + * in rare release paths a bit more obvious. + */ + if (IS_ENABLED(CONFIG_LOCKDEP) && irq->intid >= VGIC_MIN_LPI) { + guard(spinlock_irqsave)(&dist->lpi_xa.xa_lock); + } if (!__vgic_put_irq(kvm, irq)) return; - xa_lock(&dist->lpi_xa); + xa_lock_irqsave(&dist->lpi_xa, flags); vgic_release_lpi_locked(dist, irq); - xa_unlock(&dist->lpi_xa); + xa_unlock_irqrestore(&dist->lpi_xa, flags); } static void vgic_release_deleted_lpis(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - unsigned long intid; + unsigned long flags, intid; struct vgic_irq *irq; - xa_lock(&dist->lpi_xa); + xa_lock_irqsave(&dist->lpi_xa, flags); xa_for_each(&dist->lpi_xa, intid, irq) { if (irq->pending_release) vgic_release_lpi_locked(dist, irq); } - xa_unlock(&dist->lpi_xa); + xa_unlock_irqrestore(&dist->lpi_xa, flags); } void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index d816ff44faff..a193b6a5d1e6 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -967,10 +967,21 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, return vma_alloc_folio(flags, 0, vma, vaddr); } -void tag_clear_highpage(struct page *page) +bool tag_clear_highpages(struct page *page, int numpages) { - /* Newly allocated page, shouldn't have been tagged yet */ - WARN_ON_ONCE(!try_page_mte_tagging(page)); - mte_zero_clear_page_tags(page_address(page)); - set_page_mte_tagged(page); + /* + * Check if MTE is supported and fall back to clear_highpage(). + * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and + * post_alloc_hook() will invoke tag_clear_highpages(). + */ + if (!system_supports_mte()) + return false; + + /* Newly allocated pages, shouldn't have been tagged yet */ + for (int i = 0; i < numpages; i++, page++) { + WARN_ON_ONCE(!try_page_mte_tagging(page)); + mte_zero_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } + return true; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b8d37eb037fc..2ba01dc8ef82 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -708,6 +708,30 @@ out: return ret; } +static inline bool force_pte_mapping(void) +{ + const bool bbml2 = system_capabilities_finalized() ? + system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); + + if (debug_pagealloc_enabled()) + return true; + if (bbml2) + return false; + return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world(); +} + +static inline bool split_leaf_mapping_possible(void) +{ + /* + * !BBML2_NOABORT systems should never run into scenarios where we would + * have to split. So exit early and let calling code detect it and raise + * a warning. + */ + if (!system_supports_bbml2_noabort()) + return false; + return !force_pte_mapping(); +} + static DEFINE_MUTEX(pgtable_split_lock); int split_kernel_leaf_mapping(unsigned long start, unsigned long end) @@ -715,12 +739,11 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) int ret; /* - * !BBML2_NOABORT systems should not be trying to change permissions on - * anything that is not pte-mapped in the first place. Just return early - * and let the permission change code raise a warning if not already - * pte-mapped. + * Exit early if the region is within a pte-mapped area or if we can't + * split. For the latter case, the permission change code will raise a + * warning if not already pte-mapped. */ - if (!system_supports_bbml2_noabort()) + if (!split_leaf_mapping_possible() || is_kfence_address((void *)start)) return 0; /* @@ -758,30 +781,30 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end) return ret; } -static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr, - unsigned long next, - struct mm_walk *walk) +static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr, + unsigned long next, struct mm_walk *walk) { + gfp_t gfp = *(gfp_t *)walk->private; pud_t pud = pudp_get(pudp); int ret = 0; if (pud_leaf(pud)) - ret = split_pud(pudp, pud, GFP_ATOMIC, false); + ret = split_pud(pudp, pud, gfp, false); return ret; } -static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, - unsigned long next, - struct mm_walk *walk) +static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, + unsigned long next, struct mm_walk *walk) { + gfp_t gfp = *(gfp_t *)walk->private; pmd_t pmd = pmdp_get(pmdp); int ret = 0; if (pmd_leaf(pmd)) { if (pmd_cont(pmd)) split_contpmd(pmdp); - ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false); + ret = split_pmd(pmdp, pmd, gfp, false); /* * We have split the pmd directly to ptes so there is no need to @@ -793,9 +816,8 @@ static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, return ret; } -static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, - unsigned long next, - struct mm_walk *walk) +static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) { pte_t pte = __ptep_get(ptep); @@ -805,12 +827,24 @@ static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, return 0; } -static const struct mm_walk_ops split_to_ptes_ops __initconst = { +static const struct mm_walk_ops split_to_ptes_ops = { .pud_entry = split_to_ptes_pud_entry, .pmd_entry = split_to_ptes_pmd_entry, .pte_entry = split_to_ptes_pte_entry, }; +static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp) +{ + int ret; + + arch_enter_lazy_mmu_mode(); + ret = walk_kernel_page_table_range_lockless(start, end, + &split_to_ptes_ops, NULL, &gfp); + arch_leave_lazy_mmu_mode(); + + return ret; +} + static bool linear_map_requires_bbml2 __initdata; u32 idmap_kpti_bbml2_flag; @@ -847,11 +881,9 @@ static int __init linear_map_split_to_ptes(void *__unused) * PTE. The kernel alias remains static throughout runtime so * can continue to be safely mapped with large mappings. */ - ret = walk_kernel_page_table_range_lockless(lstart, kstart, - &split_to_ptes_ops, NULL, NULL); + ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC); if (!ret) - ret = walk_kernel_page_table_range_lockless(kend, lend, - &split_to_ptes_ops, NULL, NULL); + ret = range_split_to_ptes(kend, lend, GFP_ATOMIC); if (ret) panic("Failed to split linear map\n"); flush_tlb_kernel_range(lstart, lend); @@ -1002,6 +1034,33 @@ static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); __kfence_pool = phys_to_virt(kfence_pool); } + +bool arch_kfence_init_pool(void) +{ + unsigned long start = (unsigned long)__kfence_pool; + unsigned long end = start + KFENCE_POOL_SIZE; + int ret; + + /* Exit early if we know the linear map is already pte-mapped. */ + if (!split_leaf_mapping_possible()) + return true; + + /* Kfence pool is already pte-mapped for the early init case. */ + if (kfence_early_init) + return true; + + mutex_lock(&pgtable_split_lock); + ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL); + mutex_unlock(&pgtable_split_lock); + + /* + * Since the system supports bbml2_noabort, tlb invalidation is not + * required here; the pgtable mappings have been split to pte but larger + * entries may safely linger in the TLB. + */ + + return !ret; +} #else /* CONFIG_KFENCE */ static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; } @@ -1009,16 +1068,6 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { #endif /* CONFIG_KFENCE */ -static inline bool force_pte_mapping(void) -{ - bool bbml2 = system_capabilities_finalized() ? - system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); - - return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() || - is_realm_world())) || - debug_pagealloc_enabled(); -} - static void __init map_mem(pgd_t *pgdp) { static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl index 8d9088bc577d..8cdfe5d4dac9 100644 --- a/arch/arm64/tools/syscall_32.tbl +++ b/arch/arm64/tools/syscall_32.tbl @@ -481,3 +481,4 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index dc5bd3f1b8d2..96ca1a688984 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -109,7 +109,7 @@ endif ifdef CONFIG_RUSTC_HAS_ANNOTATE_TABLEJUMP KBUILD_RUSTFLAGS += -Cllvm-args=--loongarch-annotate-tablejump else -KBUILD_RUSTFLAGS += -Zno-jump-tables # keep compatibility with older compilers +KBUILD_RUSTFLAGS += $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables) # keep compatibility with older compilers endif ifdef CONFIG_LTO_CLANG # The annotate-tablejump option can not be passed to LLVM backend when LTO is enabled. diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 3e838c229cd5..50e1304e7a6f 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -917,7 +917,6 @@ CONFIG_MMC=y CONFIG_MMC_LOONGSON2=m CONFIG_INFINIBAND=m CONFIG_EDAC=y -# CONFIG_EDAC_LEGACY_SYSFS is not set CONFIG_EDAC_LOONGSON=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_EFI=y diff --git a/arch/loongarch/include/asm/bug.h b/arch/loongarch/include/asm/bug.h index f6f254f2c5db..d090a5bec5eb 100644 --- a/arch/loongarch/include/asm/bug.h +++ b/arch/loongarch/include/asm/bug.h @@ -11,7 +11,7 @@ #else #define __BUGVERBOSE_LOCATION(file, line) \ .pushsection .rodata.str, "aMS", @progbits, 1; \ - 10002: .string file; \ + 10002: .ascii file "\0"; \ .popsection; \ \ .long 10002b - .; \ @@ -20,39 +20,38 @@ #endif #ifndef CONFIG_GENERIC_BUG -#define __BUG_ENTRY(flags) +#define __BUG_ENTRY(cond_str, flags) #else -#define __BUG_ENTRY(flags) \ +#define __BUG_ENTRY(cond_str, flags) \ .pushsection __bug_table, "aw"; \ .align 2; \ 10000: .long 10001f - .; \ - _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ - .short flags; \ + _BUGVERBOSE_LOCATION(WARN_CONDITION_STR(cond_str) __FILE__, __LINE__) \ + .short flags; \ .popsection; \ 10001: #endif -#define ASM_BUG_FLAGS(flags) \ - __BUG_ENTRY(flags) \ +#define ASM_BUG_FLAGS(cond_str, flags) \ + __BUG_ENTRY(cond_str, flags) \ break BRK_BUG; -#define ASM_BUG() ASM_BUG_FLAGS(0) +#define ASM_BUG() ASM_BUG_FLAGS("", 0) -#define __BUG_FLAGS(flags, extra) \ - asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)) \ - extra); +#define __BUG_FLAGS(cond_str, flags, extra) \ + asm_inline volatile (__stringify(ASM_BUG_FLAGS(cond_str, flags)) extra); -#define __WARN_FLAGS(flags) \ +#define __WARN_FLAGS(cond_str, flags) \ do { \ instrumentation_begin(); \ - __BUG_FLAGS(BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\ + __BUG_FLAGS(cond_str, BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\ instrumentation_end(); \ } while (0) #define BUG() \ do { \ instrumentation_begin(); \ - __BUG_FLAGS(0, ""); \ + __BUG_FLAGS("", 0, ""); \ unreachable(); \ } while (0) diff --git a/arch/loongarch/include/asm/cpu-features.h b/arch/loongarch/include/asm/cpu-features.h index fc83bb32f9f0..bd5f0457ad21 100644 --- a/arch/loongarch/include/asm/cpu-features.h +++ b/arch/loongarch/include/asm/cpu-features.h @@ -67,6 +67,8 @@ #define cpu_has_hypervisor cpu_opt(LOONGARCH_CPU_HYPERVISOR) #define cpu_has_ptw cpu_opt(LOONGARCH_CPU_PTW) #define cpu_has_lspw cpu_opt(LOONGARCH_CPU_LSPW) +#define cpu_has_msgint cpu_opt(LOONGARCH_CPU_MSGINT) #define cpu_has_avecint cpu_opt(LOONGARCH_CPU_AVECINT) +#define cpu_has_redirectint cpu_opt(LOONGARCH_CPU_REDIRECTINT) #endif /* __ASM_CPU_FEATURES_H */ diff --git a/arch/loongarch/include/asm/cpu.h b/arch/loongarch/include/asm/cpu.h index dfb982fe8701..f3efb00b6141 100644 --- a/arch/loongarch/include/asm/cpu.h +++ b/arch/loongarch/include/asm/cpu.h @@ -55,6 +55,27 @@ enum cpu_type_enum { CPU_LAST }; +static inline char *id_to_core_name(unsigned int id) +{ + if ((id & PRID_COMP_MASK) != PRID_COMP_LOONGSON) + return "Unknown"; + + switch (id & PRID_SERIES_MASK) { + case PRID_SERIES_LA132: + return "LA132"; + case PRID_SERIES_LA264: + return "LA264"; + case PRID_SERIES_LA364: + return "LA364"; + case PRID_SERIES_LA464: + return "LA464"; + case PRID_SERIES_LA664: + return "LA664"; + default: + return "Unknown"; + } +} + #endif /* !__ASSEMBLER__ */ /* @@ -101,7 +122,9 @@ enum cpu_type_enum { #define CPU_FEATURE_HYPERVISOR 26 /* CPU has hypervisor (running in VM) */ #define CPU_FEATURE_PTW 27 /* CPU has hardware page table walker */ #define CPU_FEATURE_LSPW 28 /* CPU has LSPW (lddir/ldpte instructions) */ -#define CPU_FEATURE_AVECINT 29 /* CPU has AVEC interrupt */ +#define CPU_FEATURE_MSGINT 29 /* CPU has MSG interrupt */ +#define CPU_FEATURE_AVECINT 30 /* CPU has AVEC interrupt */ +#define CPU_FEATURE_REDIRECTINT 31 /* CPU has interrupt remapping */ #define LOONGARCH_CPU_CPUCFG BIT_ULL(CPU_FEATURE_CPUCFG) #define LOONGARCH_CPU_LAM BIT_ULL(CPU_FEATURE_LAM) @@ -132,6 +155,8 @@ enum cpu_type_enum { #define LOONGARCH_CPU_HYPERVISOR BIT_ULL(CPU_FEATURE_HYPERVISOR) #define LOONGARCH_CPU_PTW BIT_ULL(CPU_FEATURE_PTW) #define LOONGARCH_CPU_LSPW BIT_ULL(CPU_FEATURE_LSPW) +#define LOONGARCH_CPU_MSGINT BIT_ULL(CPU_FEATURE_MSGINT) #define LOONGARCH_CPU_AVECINT BIT_ULL(CPU_FEATURE_AVECINT) +#define LOONGARCH_CPU_REDIRECTINT BIT_ULL(CPU_FEATURE_REDIRECTINT) #endif /* _ASM_CPU_H */ diff --git a/arch/loongarch/include/asm/hw_breakpoint.h b/arch/loongarch/include/asm/hw_breakpoint.h index 13b2462f3d8c..5faa97a87a9e 100644 --- a/arch/loongarch/include/asm/hw_breakpoint.h +++ b/arch/loongarch/include/asm/hw_breakpoint.h @@ -134,13 +134,13 @@ static inline void hw_breakpoint_thread_switch(struct task_struct *next) /* Determine number of BRP registers available. */ static inline int get_num_brps(void) { - return csr_read64(LOONGARCH_CSR_FWPC) & CSR_FWPC_NUM; + return csr_read32(LOONGARCH_CSR_FWPC) & CSR_FWPC_NUM; } /* Determine number of WRP registers available. */ static inline int get_num_wrps(void) { - return csr_read64(LOONGARCH_CSR_MWPC) & CSR_MWPC_NUM; + return csr_read32(LOONGARCH_CSR_MWPC) & CSR_MWPC_NUM; } #endif /* __KERNEL__ */ diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index eaff72b38dc8..0130185e0349 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -14,7 +14,7 @@ #include <asm/pgtable-bits.h> #include <asm/string.h> -extern void __init __iomem *early_ioremap(u64 phys_addr, unsigned long size); +extern void __init __iomem *early_ioremap(phys_addr_t phys_addr, unsigned long size); extern void __init early_iounmap(void __iomem *addr, unsigned long size); #define early_memremap early_ioremap @@ -25,6 +25,9 @@ extern void __init early_iounmap(void __iomem *addr, unsigned long size); static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, pgprot_t prot) { + if (offset > TO_PHYS_MASK) + return NULL; + switch (pgprot_val(prot) & _CACHE_MASK) { case _CACHE_CC: return (void __iomem *)(unsigned long)(CACHE_BASE + offset); diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 09dfd7eb406e..3de03cb864b2 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -128,6 +128,7 @@ #define CPUCFG6_PMNUM GENMASK(7, 4) #define CPUCFG6_PMNUM_SHIFT 4 #define CPUCFG6_PMBITS GENMASK(13, 8) +#define CPUCFG6_PMBITS_SHIFT 8 #define CPUCFG6_UPM BIT(14) #define LOONGARCH_CPUCFG16 0x10 @@ -1137,6 +1138,7 @@ #define IOCSRF_FLATMODE BIT_ULL(10) #define IOCSRF_VM BIT_ULL(11) #define IOCSRF_AVEC BIT_ULL(15) +#define IOCSRF_REDIRECT BIT_ULL(16) #define LOONGARCH_IOCSR_VENDOR 0x10 diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index 1c63a9d9a6d3..08dcc698ec18 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -88,7 +88,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { pud_t *pud; - struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); if (!ptdesc) return NULL; diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index bd128696e96d..03fb60432fde 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -424,6 +424,9 @@ static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { + if (pte_val(pte) & _PAGE_DIRTY) + pte_val(pte) |= _PAGE_MODIFIED; + return __pte((pte_val(pte) & _PAGE_CHG_MASK) | (pgprot_val(newprot) & ~_PAGE_CHG_MASK)); } @@ -547,9 +550,11 @@ static inline struct page *pmd_page(pmd_t pmd) static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { - pmd_val(pmd) = (pmd_val(pmd) & _HPAGE_CHG_MASK) | - (pgprot_val(newprot) & ~_HPAGE_CHG_MASK); - return pmd; + if (pmd_val(pmd) & _PAGE_DIRTY) + pmd_val(pmd) |= _PAGE_MODIFIED; + + return __pmd((pmd_val(pmd) & _HPAGE_CHG_MASK) | + (pgprot_val(newprot) & ~_HPAGE_CHG_MASK)); } static inline pmd_t pmd_mkinvalid(pmd_t pmd) diff --git a/arch/loongarch/include/uapi/asm/ptrace.h b/arch/loongarch/include/uapi/asm/ptrace.h index aafb3cd9e943..215e0f9e8aa3 100644 --- a/arch/loongarch/include/uapi/asm/ptrace.h +++ b/arch/loongarch/include/uapi/asm/ptrace.h @@ -10,10 +10,6 @@ #include <linux/types.h> -#ifndef __KERNEL__ -#include <stdint.h> -#endif - /* * For PTRACE_{POKE,PEEK}USR. 0 - 31 are GPRs, * 32 is syscall's original ARG0, 33 is PC, 34 is BADVADDR. @@ -41,44 +37,44 @@ struct user_pt_regs { } __attribute__((aligned(8))); struct user_fp_state { - uint64_t fpr[32]; - uint64_t fcc; - uint32_t fcsr; + __u64 fpr[32]; + __u64 fcc; + __u32 fcsr; }; struct user_lsx_state { /* 32 registers, 128 bits width per register. */ - uint64_t vregs[32*2]; + __u64 vregs[32*2]; }; struct user_lasx_state { /* 32 registers, 256 bits width per register. */ - uint64_t vregs[32*4]; + __u64 vregs[32*4]; }; struct user_lbt_state { - uint64_t scr[4]; - uint32_t eflags; - uint32_t ftop; + __u64 scr[4]; + __u32 eflags; + __u32 ftop; }; struct user_watch_state { - uint64_t dbg_info; + __u64 dbg_info; struct { - uint64_t addr; - uint64_t mask; - uint32_t ctrl; - uint32_t pad; + __u64 addr; + __u64 mask; + __u32 ctrl; + __u32 pad; } dbg_regs[8]; }; struct user_watch_state_v2 { - uint64_t dbg_info; + __u64 dbg_info; struct { - uint64_t addr; - uint64_t mask; - uint32_t ctrl; - uint32_t pad; + __u64 addr; + __u64 mask; + __u32 ctrl; + __u32 pad; } dbg_regs[14]; }; diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c index cbfce2872d71..a2060a24b39f 100644 --- a/arch/loongarch/kernel/cpu-probe.c +++ b/arch/loongarch/kernel/cpu-probe.c @@ -157,6 +157,8 @@ static void cpu_probe_common(struct cpuinfo_loongarch *c) c->options |= LOONGARCH_CPU_TLB; if (config & CPUCFG1_IOCSR) c->options |= LOONGARCH_CPU_IOCSR; + if (config & CPUCFG1_MSGINT) + c->options |= LOONGARCH_CPU_MSGINT; if (config & CPUCFG1_UAL) { c->options |= LOONGARCH_CPU_UAL; elf_hwcap |= HWCAP_LOONGARCH_UAL; @@ -275,7 +277,7 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int uint32_t config; uint64_t *vendor = (void *)(&cpu_full_name[VENDOR_OFFSET]); uint64_t *cpuname = (void *)(&cpu_full_name[CPUNAME_OFFSET]); - const char *core_name = "Unknown"; + const char *core_name = id_to_core_name(c->processor_id); switch (BIT(fls(c->isa_level) - 1)) { case LOONGARCH_CPU_ISA_LA32R: @@ -289,35 +291,23 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int break; } - switch (c->processor_id & PRID_SERIES_MASK) { - case PRID_SERIES_LA132: - core_name = "LA132"; - break; - case PRID_SERIES_LA264: - core_name = "LA264"; - break; - case PRID_SERIES_LA364: - core_name = "LA364"; - break; - case PRID_SERIES_LA464: - core_name = "LA464"; - break; - case PRID_SERIES_LA664: - core_name = "LA664"; - break; - } - pr_info("%s Processor probed (%s Core)\n", __cpu_family[cpu], core_name); - if (!cpu_has_iocsr) + if (!cpu_has_iocsr) { + __cpu_full_name[cpu] = "Unknown"; return; - - if (!__cpu_full_name[cpu]) - __cpu_full_name[cpu] = cpu_full_name; + } *vendor = iocsr_read64(LOONGARCH_IOCSR_VENDOR); *cpuname = iocsr_read64(LOONGARCH_IOCSR_CPUNAME); + if (!__cpu_full_name[cpu]) { + if (((char *)vendor)[0] == 0) + __cpu_full_name[cpu] = "Unknown"; + else + __cpu_full_name[cpu] = cpu_full_name; + } + config = iocsr_read32(LOONGARCH_IOCSR_FEATURES); if (config & IOCSRF_CSRIPI) c->options |= LOONGARCH_CPU_CSRIPI; @@ -331,6 +321,8 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int c->options |= LOONGARCH_CPU_EIODECODE; if (config & IOCSRF_AVEC) c->options |= LOONGARCH_CPU_AVECINT; + if (config & IOCSRF_REDIRECT) + c->options |= LOONGARCH_CPU_REDIRECTINT; if (config & IOCSRF_VM) c->options |= LOONGARCH_CPU_HYPERVISOR; } diff --git a/arch/loongarch/kernel/kexec_efi.c b/arch/loongarch/kernel/kexec_efi.c index 45121b914f8f..5ee78ebb1546 100644 --- a/arch/loongarch/kernel/kexec_efi.c +++ b/arch/loongarch/kernel/kexec_efi.c @@ -42,7 +42,7 @@ static void *efi_kexec_load(struct kimage *image, { int ret; unsigned long text_offset, kernel_segment_number; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; struct kexec_segment *kernel_segment; struct loongarch_image_header *h; diff --git a/arch/loongarch/kernel/kexec_elf.c b/arch/loongarch/kernel/kexec_elf.c index 97b2f049801a..1b6b64744c7f 100644 --- a/arch/loongarch/kernel/kexec_elf.c +++ b/arch/loongarch/kernel/kexec_elf.c @@ -59,7 +59,7 @@ static void *elf_kexec_load(struct kimage *image, int ret; unsigned long text_offset, kernel_segment_number; struct elfhdr ehdr; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; struct kexec_elf_info elf_info; struct kexec_segment *kernel_segment; diff --git a/arch/loongarch/kernel/machine_kexec.c b/arch/loongarch/kernel/machine_kexec.c index e4b2bbc47e62..d7fafda1d541 100644 --- a/arch/loongarch/kernel/machine_kexec.c +++ b/arch/loongarch/kernel/machine_kexec.c @@ -39,34 +39,12 @@ static unsigned long systable_ptr; static unsigned long start_addr; static unsigned long first_ind_entry; -static void kexec_image_info(const struct kimage *kimage) -{ - unsigned long i; - - pr_debug("kexec kimage info:\n"); - pr_debug("\ttype: %d\n", kimage->type); - pr_debug("\tstart: %lx\n", kimage->start); - pr_debug("\thead: %lx\n", kimage->head); - pr_debug("\tnr_segments: %lu\n", kimage->nr_segments); - - for (i = 0; i < kimage->nr_segments; i++) { - pr_debug("\t segment[%lu]: %016lx - %016lx", i, - kimage->segment[i].mem, - kimage->segment[i].mem + kimage->segment[i].memsz); - pr_debug("\t\t0x%lx bytes, %lu pages\n", - (unsigned long)kimage->segment[i].memsz, - (unsigned long)kimage->segment[i].memsz / PAGE_SIZE); - } -} - int machine_kexec_prepare(struct kimage *kimage) { int i; char *bootloader = "kexec"; void *cmdline_ptr = (void *)KEXEC_CMDLINE_ADDR; - kexec_image_info(kimage); - kimage->arch.efi_boot = fw_arg0; kimage->arch.systable_ptr = fw_arg2; @@ -259,6 +237,7 @@ void machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_SMP crash_smp_send_stop(); #endif + machine_kexec_mask_interrupts(); cpumask_set_cpu(crashing_cpu, &cpus_in_crash); pr_info("Starting crashdump kernel...\n"); @@ -296,6 +275,7 @@ void machine_kexec(struct kimage *image) /* We do not want to be bothered. */ local_irq_disable(); + machine_kexec_mask_interrupts(); pr_notice("EFI boot flag: 0x%lx\n", efi_boot); pr_notice("Command line addr: 0x%lx\n", cmdline_ptr); diff --git a/arch/loongarch/kernel/machine_kexec_file.c b/arch/loongarch/kernel/machine_kexec_file.c index dda236b51a88..fb57026f5f25 100644 --- a/arch/loongarch/kernel/machine_kexec_file.c +++ b/arch/loongarch/kernel/machine_kexec_file.c @@ -143,7 +143,7 @@ int load_other_segments(struct kimage *image, unsigned long initrd_load_addr = 0; unsigned long orig_segments = image->nr_segments; char *modified_cmdline = NULL; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; kbuf.image = image; /* Don't allocate anything below the kernel */ diff --git a/arch/loongarch/kernel/mem.c b/arch/loongarch/kernel/mem.c index aed901c57fb4..8ab1ffedc52c 100644 --- a/arch/loongarch/kernel/mem.c +++ b/arch/loongarch/kernel/mem.c @@ -13,7 +13,7 @@ void __init memblock_init(void) { u32 mem_type; - u64 mem_start, mem_end, mem_size; + u64 mem_start, mem_size; efi_memory_desc_t *md; /* Parse memory information */ @@ -21,7 +21,6 @@ void __init memblock_init(void) mem_type = md->type; mem_start = md->phys_addr; mem_size = md->num_pages << EFI_PAGE_SHIFT; - mem_end = mem_start + mem_size; switch (mem_type) { case EFI_LOADER_CODE: @@ -31,8 +30,6 @@ void __init memblock_init(void) case EFI_PERSISTENT_MEMORY: case EFI_CONVENTIONAL_MEMORY: memblock_add(mem_start, mem_size); - if (max_low_pfn < (mem_end >> PAGE_SHIFT)) - max_low_pfn = mem_end >> PAGE_SHIFT; break; case EFI_PAL_CODE: case EFI_UNUSABLE_MEMORY: @@ -49,6 +46,8 @@ void __init memblock_init(void) } } + max_pfn = PFN_DOWN(memblock_end_of_DRAM()); + max_low_pfn = min(PFN_DOWN(HIGHMEM_START), max_pfn); memblock_set_current_limit(PFN_PHYS(max_low_pfn)); /* Reserve the first 2MB */ diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index d6e73e8f9c0b..8b89898e20df 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -158,35 +158,9 @@ static void __init node_mem_init(unsigned int node) #ifdef CONFIG_ACPI_NUMA -/* - * add_numamem_region - * - * Add a uasable memory region described by BIOS. The - * routine gets each intersection between BIOS's region - * and node's region, and adds them into node's memblock - * pool. - * - */ -static void __init add_numamem_region(u64 start, u64 end, u32 type) -{ - u32 node = pa_to_nid(start); - u64 size = end - start; - static unsigned long num_physpages; - - if (start >= end) { - pr_debug("Invalid region: %016llx-%016llx\n", start, end); - return; - } - - num_physpages += (size >> PAGE_SHIFT); - pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n", - node, type, start, size); - pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", - start >> PAGE_SHIFT, end >> PAGE_SHIFT, num_physpages); - memblock_set_node(start, size, &memblock.memory, node); -} +static unsigned long num_physpages; -static void __init init_node_memblock(void) +static void __init info_node_memblock(void) { u32 mem_type; u64 mem_end, mem_start, mem_size; @@ -206,12 +180,20 @@ static void __init init_node_memblock(void) case EFI_BOOT_SERVICES_DATA: case EFI_PERSISTENT_MEMORY: case EFI_CONVENTIONAL_MEMORY: - add_numamem_region(mem_start, mem_end, mem_type); + num_physpages += (mem_size >> PAGE_SHIFT); + pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n", + (u32)pa_to_nid(mem_start), mem_type, mem_start, mem_size); + pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", + mem_start >> PAGE_SHIFT, mem_end >> PAGE_SHIFT, num_physpages); break; case EFI_PAL_CODE: case EFI_UNUSABLE_MEMORY: case EFI_ACPI_RECLAIM_MEMORY: - add_numamem_region(mem_start, mem_end, mem_type); + num_physpages += (mem_size >> PAGE_SHIFT); + pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n", + (u32)pa_to_nid(mem_start), mem_type, mem_start, mem_size); + pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", + mem_start >> PAGE_SHIFT, mem_end >> PAGE_SHIFT, num_physpages); fallthrough; case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE: @@ -249,22 +231,16 @@ int __init init_numa_memory(void) for (i = 0; i < NR_CPUS; i++) set_cpuid_to_node(i, NUMA_NO_NODE); - numa_reset_distance(); - nodes_clear(numa_nodes_parsed); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - WARN_ON(memblock_clear_hotplug(0, PHYS_ADDR_MAX)); - /* Parse SRAT and SLIT if provided by firmware. */ - ret = acpi_disabled ? fake_numa_init() : acpi_numa_init(); + if (!acpi_disabled) + ret = numa_memblks_init(acpi_numa_init, false); + else + ret = numa_memblks_init(fake_numa_init, false); + if (ret < 0) return ret; - node_possible_map = numa_nodes_parsed; - if (WARN_ON(nodes_empty(node_possible_map))) - return -EINVAL; - - init_node_memblock(); + info_node_memblock(); if (!memblock_validate_numa_coverage(SZ_1M)) return -EINVAL; @@ -272,7 +248,8 @@ int __init init_numa_memory(void) node_mem_init(node); node_set_online(node); } - max_low_pfn = PHYS_PFN(memblock_end_of_DRAM()); + max_pfn = PFN_DOWN(memblock_end_of_DRAM()); + max_low_pfn = min(PFN_DOWN(HIGHMEM_START), max_pfn); setup_nr_node_ids(); loongson_sysconf.nr_nodes = nr_node_ids; @@ -283,26 +260,6 @@ int __init init_numa_memory(void) #endif -void __init paging_init(void) -{ - unsigned int node; - unsigned long zones_size[MAX_NR_ZONES] = {0, }; - - for_each_online_node(node) { - unsigned long start_pfn, end_pfn; - - get_pfn_range_for_nid(node, &start_pfn, &end_pfn); - - if (end_pfn > max_low_pfn) - max_low_pfn = end_pfn; - } -#ifdef CONFIG_ZONE_DMA32 - zones_size[ZONE_DMA32] = MAX_DMA32_PFN; -#endif - zones_size[ZONE_NORMAL] = max_low_pfn; - free_area_init(zones_size); -} - int pcibus_to_node(struct pci_bus *bus) { return dev_to_node(&bus->dev); diff --git a/arch/loongarch/kernel/perf_event.c b/arch/loongarch/kernel/perf_event.c index 8ad098703488..9d257c8519c9 100644 --- a/arch/loongarch/kernel/perf_event.c +++ b/arch/loongarch/kernel/perf_event.c @@ -845,13 +845,14 @@ static const struct loongarch_perf_event *loongarch_pmu_map_raw_event(u64 config static int __init init_hw_perf_events(void) { - int counters; + int bits, counters; if (!cpu_has_pmp) return -ENODEV; pr_info("Performance counters: "); - counters = ((read_cpucfg(LOONGARCH_CPUCFG6) & CPUCFG6_PMNUM) >> 4) + 1; + bits = ((read_cpucfg(LOONGARCH_CPUCFG6) & CPUCFG6_PMBITS) >> CPUCFG6_PMBITS_SHIFT) + 1; + counters = ((read_cpucfg(LOONGARCH_CPUCFG6) & CPUCFG6_PMNUM) >> CPUCFG6_PMNUM_SHIFT) + 1; loongarch_pmu.num_counters = counters; loongarch_pmu.max_period = (1ULL << 63) - 1; @@ -867,7 +868,7 @@ static int __init init_hw_perf_events(void) on_each_cpu(reset_counters, NULL, 1); pr_cont("%s PMU enabled, %d %d-bit counters available to each CPU.\n", - loongarch_pmu.name, counters, 64); + loongarch_pmu.name, counters, bits); perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); diff --git a/arch/loongarch/kernel/proc.c b/arch/loongarch/kernel/proc.c index cea30768ae92..63d2b7e7e844 100644 --- a/arch/loongarch/kernel/proc.c +++ b/arch/loongarch/kernel/proc.c @@ -17,6 +17,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long n = (unsigned long) v - 1; unsigned int isa = cpu_data[n].isa_level; + unsigned int prid = cpu_data[n].processor_id; unsigned int version = cpu_data[n].processor_id & 0xff; unsigned int fp_version = cpu_data[n].fpu_vers; @@ -37,6 +38,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "global_id\t\t: %d\n", cpu_data[n].global_id); seq_printf(m, "CPU Family\t\t: %s\n", __cpu_family[n]); seq_printf(m, "Model Name\t\t: %s\n", __cpu_full_name[n]); + seq_printf(m, "PRID\t\t\t: %s (%08x)\n", id_to_core_name(prid), prid); seq_printf(m, "CPU Revision\t\t: 0x%02x\n", version); seq_printf(m, "FPU Revision\t\t: 0x%02x\n", fp_version); seq_printf(m, "CPU MHz\t\t\t: %llu.%02llu\n", diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 69c17d162fff..25a87378e48e 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -294,8 +294,6 @@ static void __init fdt_setup(void) early_init_dt_scan(fdt_pointer, __pa(fdt_pointer)); early_init_fdt_reserve_self(); - - max_low_pfn = PFN_PHYS(memblock_end_of_DRAM()); #endif } @@ -390,7 +388,8 @@ static void __init check_kernel_sections_mem(void) static void __init arch_mem_init(char **cmdline_p) { /* Recalculate max_low_pfn for "mem=xxx" */ - max_pfn = max_low_pfn = PHYS_PFN(memblock_end_of_DRAM()); + max_pfn = PFN_DOWN(memblock_end_of_DRAM()); + max_low_pfn = min(PFN_DOWN(HIGHMEM_START), max_pfn); if (usermem) pr_info("User-defined physical RAM map overwrite\n"); diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 3d9be6ca7ec5..da5926fead4a 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -1131,8 +1131,8 @@ static void configure_exception_vector(void) tlbrentry = (unsigned long)exception_handlers + 80*VECSIZE; csr_write64(eentry, LOONGARCH_CSR_EENTRY); - csr_write64(eentry, LOONGARCH_CSR_MERRENTRY); - csr_write64(tlbrentry, LOONGARCH_CSR_TLBRENTRY); + csr_write64(__pa(eentry), LOONGARCH_CSR_MERRENTRY); + csr_write64(__pa(tlbrentry), LOONGARCH_CSR_TLBRENTRY); } void per_cpu_trap_init(int cpu) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index c32333695381..a1cc116b4dac 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -439,7 +439,7 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, spin_lock_irqsave(&s->lock, flags); switch (type) { case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: - if (val >= EIOINTC_ROUTE_MAX_VCPUS) + if (val > EIOINTC_ROUTE_MAX_VCPUS) ret = -EINVAL; else s->num_cpu = val; diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index 7c8143e79c12..a7fa458e3360 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -857,7 +857,7 @@ retry: if (writeable) { prot_bits = kvm_pte_mkwriteable(prot_bits); - if (write) + if (write || !kvm_slot_dirty_track_enabled(memslot)) prot_bits = kvm_pte_mkdirty(prot_bits); } diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index 32dc213374be..29c2aaba63c3 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -4,6 +4,7 @@ */ #include <linux/kvm_host.h> +#include <asm/delay.h> #include <asm/kvm_csr.h> #include <asm/kvm_vcpu.h> @@ -95,6 +96,7 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) * and set CSR TVAL with -1 */ write_gcsr_timertick(0); + __delay(2); /* Wait cycles until timer interrupt injected */ /* * Writing CSR_TINTCLR_TI to LOONGARCH_CSR_TINTCLR will clear diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 30e3b089a596..1245a6b35896 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -132,6 +132,9 @@ static void kvm_lose_pmu(struct kvm_vcpu *vcpu) * Clear KVM_LARCH_PMU if the guest is not using PMU CSRs when * exiting the guest, so that the next time trap into the guest. * We don't need to deal with PMU CSRs contexts. + * + * Otherwise set the request bit KVM_REQ_PMU to restore guest PMU + * before entering guest VM */ val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0); val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1); @@ -139,16 +142,12 @@ static void kvm_lose_pmu(struct kvm_vcpu *vcpu) val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3); if (!(val & KVM_PMU_EVENT_ENABLED)) vcpu->arch.aux_inuse &= ~KVM_LARCH_PMU; + else + kvm_make_request(KVM_REQ_PMU, vcpu); kvm_restore_host_pmu(vcpu); } -static void kvm_restore_pmu(struct kvm_vcpu *vcpu) -{ - if ((vcpu->arch.aux_inuse & KVM_LARCH_PMU)) - kvm_make_request(KVM_REQ_PMU, vcpu); -} - static void kvm_check_pmu(struct kvm_vcpu *vcpu) { if (kvm_check_request(KVM_REQ_PMU, vcpu)) { @@ -299,7 +298,10 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.aux_inuse &= ~KVM_LARCH_SWCSR_LATEST; if (kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending()) { - kvm_lose_pmu(vcpu); + if (vcpu->arch.aux_inuse & KVM_LARCH_PMU) { + kvm_lose_pmu(vcpu); + kvm_make_request(KVM_REQ_PMU, vcpu); + } /* make sure the vcpu mode has been written */ smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); @@ -1604,9 +1606,6 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_restore_timer(vcpu); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - /* Restore hardware PMU CSRs */ - kvm_restore_pmu(vcpu); - /* Don't bother restoring registers multiple times unless necessary */ if (vcpu->arch.aux_inuse & KVM_LARCH_HWCSR_USABLE) return 0; diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index c3e4586a7975..6bfd4b8dad1b 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -60,7 +60,6 @@ int __ref page_is_ram(unsigned long pfn) return memblock_is_memory(addr) && !memblock_is_reserved(addr); } -#ifndef CONFIG_NUMA void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; @@ -72,7 +71,6 @@ void __init paging_init(void) free_area_init(max_zone_pfns); } -#endif /* !CONFIG_NUMA */ void __ref free_initmem(void) { diff --git a/arch/loongarch/mm/ioremap.c b/arch/loongarch/mm/ioremap.c index df949a3d0f34..27c336959fe8 100644 --- a/arch/loongarch/mm/ioremap.c +++ b/arch/loongarch/mm/ioremap.c @@ -6,7 +6,7 @@ #include <asm/io.h> #include <asm-generic/early_ioremap.h> -void __init __iomem *early_ioremap(u64 phys_addr, unsigned long size) +void __init __iomem *early_ioremap(phys_addr_t phys_addr, unsigned long size) { return ((void __iomem *)TO_CACHE(phys_addr)); } diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index cbe53d0b7fb0..f97dc9936401 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1624,6 +1624,9 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i /* Direct jump skips 5 NOP instructions */ else if (is_bpf_text_address((unsigned long)orig_call)) orig_call += LOONGARCH_BPF_FENTRY_NBYTES; + /* Module tracing not supported - cause kernel lockups */ + else if (is_module_text_address((unsigned long)orig_call)) + return -ENOTSUPP; if (flags & BPF_TRAMP_F_CALL_ORIG) { move_addr(ctx, LOONGARCH_GPR_A0, (const u64)im); diff --git a/arch/loongarch/pci/pci.c b/arch/loongarch/pci/pci.c index 5bc9627a6cf9..d9fc5d520b37 100644 --- a/arch/loongarch/pci/pci.c +++ b/arch/loongarch/pci/pci.c @@ -50,11 +50,11 @@ static int __init pcibios_init(void) */ lsize = cpu_last_level_cache_line_size(); - BUG_ON(!lsize); + if (lsize) { + pci_dfl_cache_line_size = lsize >> 2; - pci_dfl_cache_line_size = lsize >> 2; - - pr_debug("PCI: pci_cache_line_size set to %d bytes\n", lsize); + pr_debug("PCI: pci_cache_line_size set to %d bytes\n", lsize); + } return 0; } diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index d8316f993482..c0cc3ca5da9f 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -19,7 +19,7 @@ ccflags-vdso := \ cflags-vdso := $(ccflags-vdso) \ -isystem $(shell $(CC) -print-file-name=include) \ $(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \ - -std=gnu11 -O2 -g -fno-strict-aliasing -fno-common -fno-builtin \ + -std=gnu11 -fms-extensions -O2 -g -fno-strict-aliasing -fno-common -fno-builtin \ -fno-stack-protector -fno-jump-tables -DDISABLE_BRANCH_PROFILING \ $(call cc-option, -fno-asynchronous-unwind-tables) \ $(call cc-option, -fno-stack-protector) diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index f41d38dfbf13..871a5d67bf41 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -469,3 +469,4 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 580af574fe73..022fc85d94b3 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -475,3 +475,4 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/arch/mips/boot/dts/econet/en751221.dtsi b/arch/mips/boot/dts/econet/en751221.dtsi index 66197e73d4f0..2abeef5b744a 100644 --- a/arch/mips/boot/dts/econet/en751221.dtsi +++ b/arch/mips/boot/dts/econet/en751221.dtsi @@ -18,7 +18,7 @@ cpu@0 { device_type = "cpu"; - compatible = "mips,mips24KEc"; + compatible = "mips,mips34Kc"; reg = <0>; }; }; diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 29191fa1801e..a3101f2268c6 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -692,7 +692,7 @@ unsigned long mips_stack_top(void) /* Space for the VDSO, data page & GIC user page */ if (current->thread.abi) { top -= PAGE_ALIGN(current->thread.abi->vdso->size); - top -= PAGE_SIZE; + top -= VDSO_NR_PAGES * PAGE_SIZE; top -= mips_gic_present() ? PAGE_SIZE : 0; /* Space to randomize the VDSO base */ diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index d824ffe9a014..8cedc83c3266 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -408,3 +408,4 @@ 467 n32 open_tree_attr sys_open_tree_attr 468 n32 file_getattr sys_file_getattr 469 n32 file_setattr sys_file_setattr +470 n32 listns sys_listns diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 7a7049c2c307..9b92bddf06b5 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -384,3 +384,4 @@ 467 n64 open_tree_attr sys_open_tree_attr 468 n64 file_getattr sys_file_getattr 469 n64 file_setattr sys_file_setattr +470 n64 listns sys_listns diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index d330274f0601..f810b8a55716 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -457,3 +457,4 @@ 467 o32 open_tree_attr sys_open_tree_attr 468 o32 file_getattr sys_file_getattr 469 o32 file_setattr sys_file_setattr +470 o32 listns sys_listns diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index 347126dc010d..44a662536148 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -12,9 +12,11 @@ #include <linux/init.h> #include <linux/sched.h> #include <linux/smp.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/export.h> +#include <linux/sort.h> #include <asm/cpu.h> #include <asm/cpu-type.h> @@ -508,58 +510,95 @@ static int __init set_ntlb(char *str) __setup("ntlb=", set_ntlb); -/* Initialise all TLB entries with unique values */ -static void r4k_tlb_uniquify(void) + +/* Comparison function for EntryHi VPN fields. */ +static int r4k_vpn_cmp(const void *a, const void *b) { - int entry = num_wired_entries(); + long v = *(unsigned long *)a - *(unsigned long *)b; + int s = sizeof(long) > sizeof(int) ? sizeof(long) * 8 - 1: 0; + return s ? (v != 0) | v >> s : v; +} + +/* + * Initialise all TLB entries with unique values that do not clash with + * what we have been handed over and what we'll be using ourselves. + */ +static void __ref r4k_tlb_uniquify(void) +{ + int tlbsize = current_cpu_data.tlbsize; + bool use_slab = slab_is_available(); + int start = num_wired_entries(); + phys_addr_t tlb_vpn_size; + unsigned long *tlb_vpns; + unsigned long vpn_mask; + int cnt, ent, idx, i; + + vpn_mask = GENMASK(cpu_vmbits - 1, 13); + vpn_mask |= IS_ENABLED(CONFIG_64BIT) ? 3ULL << 62 : 1 << 31; + + tlb_vpn_size = tlbsize * sizeof(*tlb_vpns); + tlb_vpns = (use_slab ? + kmalloc(tlb_vpn_size, GFP_KERNEL) : + memblock_alloc_raw(tlb_vpn_size, sizeof(*tlb_vpns))); + if (WARN_ON(!tlb_vpns)) + return; /* Pray local_flush_tlb_all() is good enough. */ htw_stop(); + + for (i = start, cnt = 0; i < tlbsize; i++, cnt++) { + unsigned long vpn; + + write_c0_index(i); + mtc0_tlbr_hazard(); + tlb_read(); + tlb_read_hazard(); + vpn = read_c0_entryhi(); + vpn &= vpn_mask & PAGE_MASK; + tlb_vpns[cnt] = vpn; + + /* Prevent any large pages from overlapping regular ones. */ + write_c0_pagemask(read_c0_pagemask() & PM_DEFAULT_MASK); + mtc0_tlbw_hazard(); + tlb_write_indexed(); + tlbw_use_hazard(); + } + + sort(tlb_vpns, cnt, sizeof(tlb_vpns[0]), r4k_vpn_cmp, NULL); + + write_c0_pagemask(PM_DEFAULT_MASK); write_c0_entrylo0(0); write_c0_entrylo1(0); - while (entry < current_cpu_data.tlbsize) { - unsigned long asid_mask = cpu_asid_mask(¤t_cpu_data); - unsigned long asid = 0; - int idx; + idx = 0; + ent = tlbsize; + for (i = start; i < tlbsize; i++) + while (1) { + unsigned long entryhi, vpn; - /* Skip wired MMID to make ginvt_mmid work */ - if (cpu_has_mmid) - asid = MMID_KERNEL_WIRED + 1; + entryhi = UNIQUE_ENTRYHI(ent); + vpn = entryhi & vpn_mask & PAGE_MASK; - /* Check for match before using UNIQUE_ENTRYHI */ - do { - if (cpu_has_mmid) { - write_c0_memorymapid(asid); - write_c0_entryhi(UNIQUE_ENTRYHI(entry)); + if (idx >= cnt || vpn < tlb_vpns[idx]) { + write_c0_entryhi(entryhi); + write_c0_index(i); + mtc0_tlbw_hazard(); + tlb_write_indexed(); + ent++; + break; + } else if (vpn == tlb_vpns[idx]) { + ent++; } else { - write_c0_entryhi(UNIQUE_ENTRYHI(entry) | asid); + idx++; } - mtc0_tlbw_hazard(); - tlb_probe(); - tlb_probe_hazard(); - idx = read_c0_index(); - /* No match or match is on current entry */ - if (idx < 0 || idx == entry) - break; - /* - * If we hit a match, we need to try again with - * a different ASID. - */ - asid++; - } while (asid < asid_mask); - - if (idx >= 0 && idx != entry) - panic("Unable to uniquify TLB entry %d", idx); - - write_c0_index(entry); - mtc0_tlbw_hazard(); - tlb_write_indexed(); - entry++; - } + } tlbw_use_hazard(); htw_start(); flush_micro_tlb(); + if (use_slab) + kfree(tlb_vpns); + else + memblock_free(tlb_vpns, tlb_vpn_size); } /* @@ -602,6 +641,7 @@ static void r4k_tlb_configure(void) /* From this point on the ARC firmware is dead. */ r4k_tlb_uniquify(); + local_flush_tlb_all(); /* Did I tell you that ARC SUCKS? */ } diff --git a/arch/mips/mti-malta/malta-init.c b/arch/mips/mti-malta/malta-init.c index 000d6d50520a..82b0fd8576a2 100644 --- a/arch/mips/mti-malta/malta-init.c +++ b/arch/mips/mti-malta/malta-init.c @@ -241,16 +241,22 @@ mips_pci_controller: #endif /* - * Setup the Malta max (2GB) memory for PCI DMA in host bridge - * in transparent addressing mode. + * Set up memory mapping in host bridge for PCI DMA masters, + * in transparent addressing mode. For EVA use the Malta + * maximum of 2 GiB memory in the alias space at 0x80000000 + * as per PHYS_OFFSET. Otherwise use 256 MiB of memory in + * the regular space, avoiding mapping the PCI MMIO window + * for DMA as it seems to confuse the system controller's + * logic, causing PCI MMIO to stop working. */ - mask = PHYS_OFFSET | PCI_BASE_ADDRESS_MEM_PREFETCH; - MSC_WRITE(MSC01_PCI_BAR0, mask); - MSC_WRITE(MSC01_PCI_HEAD4, mask); + mask = PHYS_OFFSET ? PHYS_OFFSET : 0xf0000000; + MSC_WRITE(MSC01_PCI_BAR0, + mask | PCI_BASE_ADDRESS_MEM_PREFETCH); + MSC_WRITE(MSC01_PCI_HEAD4, + PHYS_OFFSET | PCI_BASE_ADDRESS_MEM_PREFETCH); - mask &= MSC01_PCI_BAR0_SIZE_MSK; MSC_WRITE(MSC01_PCI_P2SCMSKL, mask); - MSC_WRITE(MSC01_PCI_P2SCMAPL, mask); + MSC_WRITE(MSC01_PCI_P2SCMAPL, PHYS_OFFSET); /* Don't handle target retries indefinitely. */ if ((data & MSC01_PCI_CFG_MAXRTRY_MSK) == diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile index 17c42d718eb3..f8481e4e9d21 100644 --- a/arch/parisc/boot/compressed/Makefile +++ b/arch/parisc/boot/compressed/Makefile @@ -18,7 +18,7 @@ KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs -Os ifndef CONFIG_64BIT KBUILD_CFLAGS += -mfast-indirect-calls endif -KBUILD_CFLAGS += -std=gnu11 +KBUILD_CFLAGS += -std=gnu11 -fms-extensions LDFLAGS_vmlinux := -X -e startup --as-needed -T $(obj)/vmlinux: $(obj)/vmlinux.lds $(addprefix $(obj)/, $(OBJECTS)) $(LIBGCC) FORCE diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h index 833555f74ffa..5aa1623e4f2f 100644 --- a/arch/parisc/include/asm/bug.h +++ b/arch/parisc/include/asm/bug.h @@ -50,7 +50,7 @@ #endif #ifdef CONFIG_DEBUG_BUGVERBOSE -#define __WARN_FLAGS(flags) \ +#define __WARN_FLAGS(cond_str, flags) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ @@ -61,12 +61,12 @@ "\t.short %1, %2\n" \ "\t.blockz %3-2*4-2*2\n" \ "\t.popsection" \ - : : "i" (__FILE__), "i" (__LINE__), \ + : : "i" (WARN_CONDITION_STR(cond_str) __FILE__), "i" (__LINE__), \ "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry)) ); \ } while(0) #else -#define __WARN_FLAGS(flags) \ +#define __WARN_FLAGS(cond_str, flags) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 88a788a7b18d..39bdacaa530b 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -468,3 +468,4 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index f7e0fee5ee55..7ac88ff13d3c 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -35,6 +35,8 @@ #define KERNEL_START (KERNEL_BINARY_TEXT_START) +#define ALIGNMENT_OK(ptr, type) (((ptr) & (sizeof(type) - 1)) == 0) + extern struct unwind_table_entry __start___unwind[]; extern struct unwind_table_entry __stop___unwind[]; @@ -257,12 +259,15 @@ static int unwind_special(struct unwind_frame_info *info, unsigned long pc, int if (pc_is_kernel_fn(pc, _switch_to) || pc == (unsigned long)&_switch_to_ret) { info->prev_sp = info->sp - CALLEE_SAVE_FRAME_SIZE; - info->prev_ip = *(unsigned long *)(info->prev_sp - RP_OFFSET); + if (ALIGNMENT_OK(info->prev_sp, long)) + info->prev_ip = *(unsigned long *)(info->prev_sp - RP_OFFSET); + else + info->prev_ip = info->prev_sp = 0; return 1; } #ifdef CONFIG_IRQSTACKS - if (pc == (unsigned long)&_call_on_stack) { + if (pc == (unsigned long)&_call_on_stack && ALIGNMENT_OK(info->sp, long)) { info->prev_sp = *(unsigned long *)(info->sp - FRAME_SIZE - REG_SZ); info->prev_ip = *(unsigned long *)(info->sp - FRAME_SIZE - RP_OFFSET); return 1; @@ -370,8 +375,10 @@ static void unwind_frame_regs(struct unwind_frame_info *info) info->prev_sp = info->sp - frame_size; if (e->Millicode) info->rp = info->r31; - else if (rpoffset) + else if (rpoffset && ALIGNMENT_OK(info->prev_sp, long)) info->rp = *(unsigned long *)(info->prev_sp - rpoffset); + else + info->rp = 0; info->prev_ip = info->rp; info->rp = 0; } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e24f4d88885a..9537a61ebae0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_DMA_OPS if PPC64 select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_GIGANTIC_PAGE if ARCH_SUPPORTS_HUGETLBFS select ARCH_HAS_KCOV select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC64 && PPC_FPU select ARCH_HAS_MEMBARRIER_CALLBACKS diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index c47b78c1d3e7..f1a4761ebd44 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -70,7 +70,7 @@ BOOTCPPFLAGS := -nostdinc $(LINUXINCLUDE) BOOTCPPFLAGS += -isystem $(shell $(BOOTCC) -print-file-name=include) BOOTCFLAGS := $(BOOTTARGETFLAGS) \ - -std=gnu11 \ + -std=gnu11 -fms-extensions \ -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -O2 \ -msoft-float -mno-altivec -mno-vsx \ @@ -86,6 +86,7 @@ BOOTARFLAGS := -crD ifdef CONFIG_CC_IS_CLANG BOOTCFLAGS += $(CLANG_FLAGS) +BOOTCFLAGS += -Wno-microsoft-anon-tag BOOTAFLAGS += $(CLANG_FLAGS) endif diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index bbaa7e81f821..0db48977c70c 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -51,11 +51,11 @@ ".previous\n" #endif -#define BUG_ENTRY(insn, flags, ...) \ +#define BUG_ENTRY(cond_str, insn, flags, ...) \ __asm__ __volatile__( \ "1: " insn "\n" \ _EMIT_BUG_ENTRY \ - : : "i" (__FILE__), "i" (__LINE__), \ + : : "i" (WARN_CONDITION_STR(cond_str) __FILE__), "i" (__LINE__), \ "i" (flags), \ "i" (sizeof(struct bug_entry)), \ ##__VA_ARGS__) @@ -67,12 +67,12 @@ */ #define BUG() do { \ - BUG_ENTRY("twi 31, 0, 0", 0); \ + BUG_ENTRY("", "twi 31, 0, 0", 0); \ unreachable(); \ } while (0) #define HAVE_ARCH_BUG -#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags)) +#define __WARN_FLAGS(cond_str, flags) BUG_ENTRY(cond_str, "twi 31, 0, 0", BUGFLAG_WARNING | (flags)) #ifdef CONFIG_PPC64 #define BUG_ON(x) do { \ @@ -80,7 +80,7 @@ if (x) \ BUG(); \ } else { \ - BUG_ENTRY(PPC_TLNEI " %4, 0", 0, "r" ((__force long)(x))); \ + BUG_ENTRY(#x, PPC_TLNEI " %4, 0", 0, "r" ((__force long)(x))); \ } \ } while (0) @@ -90,7 +90,7 @@ if (__ret_warn_on) \ __WARN(); \ } else { \ - BUG_ENTRY(PPC_TLNEI " %4, 0", \ + BUG_ENTRY(#x, PPC_TLNEI " %4, 0", \ BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \ "r" (__ret_warn_on)); \ } \ diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 4f5a46a77fa2..784a00e681fa 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -451,7 +451,7 @@ user_write_access_begin(const void __user *ptr, size_t len) #define user_write_access_begin user_write_access_begin #define user_write_access_end prevent_current_write_to_user -#define unsafe_get_user(x, p, e) do { \ +#define arch_unsafe_get_user(x, p, e) do { \ __long_type(*(p)) __gu_val; \ __typeof__(*(p)) __user *__gu_addr = (p); \ \ @@ -459,7 +459,7 @@ user_write_access_begin(const void __user *ptr, size_t len) (x) = (__typeof__(*(p)))__gu_val; \ } while (0) -#define unsafe_put_user(x, p, e) \ +#define arch_unsafe_put_user(x, p, e) \ __put_user_size_goto((__typeof__(*(p)))(x), (p), sizeof(*(p)), e) #define unsafe_copy_from_user(d, s, l, e) \ @@ -504,11 +504,11 @@ do { \ unsafe_put_user(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), e); \ } while (0) -#define __get_kernel_nofault(dst, src, type, err_label) \ +#define arch_get_kernel_nofault(dst, src, type, err_label) \ __get_user_size_goto(*((type *)(dst)), \ (__force type __user *)(src), sizeof(type), err_label) -#define __put_kernel_nofault(dst, src, type, err_label) \ +#define arch_put_kernel_nofault(dst, src, type, err_label) \ __put_user_size_goto(*((type *)(src)), \ (__force type __user *)(dst), sizeof(type), err_label) diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index b453e80dfc00..ec4458cdb97b 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -560,3 +560,4 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7b527d18aa5e..4c321a8ea896 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -423,7 +423,6 @@ config PPC_64S_HASH_MMU config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 - select ARCH_HAS_GIGANTIC_PAGE default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 7ec60290abe6..78c4b6ce5f13 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -267,22 +267,11 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags, static int spufs_context_open(const struct path *path) { - int ret; - struct file *filp; - - ret = get_unused_fd_flags(0); - if (ret < 0) - return ret; - - filp = dentry_open(path, O_RDONLY, current_cred()); - if (IS_ERR(filp)) { - put_unused_fd(ret); - return PTR_ERR(filp); - } - - filp->f_op = &spufs_context_fops; - fd_install(ret, filp); - return ret; + FD_PREPARE(fdf, 0, dentry_open(path, O_RDONLY, current_cred())); + if (fdf.err) + return fdf.err; + fd_prepare_file(fdf)->f_op = &spufs_context_fops; + return fd_publish(fdf); } static struct spu_context * @@ -508,26 +497,15 @@ static const struct file_operations spufs_gang_fops = { static int spufs_gang_open(const struct path *path) { - int ret; - struct file *filp; - - ret = get_unused_fd_flags(0); - if (ret < 0) - return ret; - /* * get references for dget and mntget, will be released * in error path of *_open(). */ - filp = dentry_open(path, O_RDONLY, current_cred()); - if (IS_ERR(filp)) { - put_unused_fd(ret); - return PTR_ERR(filp); - } - - filp->f_op = &spufs_gang_fops; - fd_install(ret, filp); - return ret; + FD_PREPARE(fdf, 0, dentry_open(path, O_RDONLY, current_cred())); + if (fdf.err) + return fdf.err; + fd_prepare_file(fdf)->f_op = &spufs_gang_fops; + return fd_publish(fdf); } static int spufs_create_gang(struct inode *inode, diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 21a2f447c43f..dd7b668799d9 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -479,10 +479,7 @@ static const struct file_operations papr_hvpipe_handle_ops = { static int papr_hvpipe_dev_create_handle(u32 srcID) { - struct hvpipe_source_info *src_info; - struct file *file; - long err; - int fd; + struct hvpipe_source_info *src_info __free(kfree) = NULL; spin_lock(&hvpipe_src_list_lock); /* @@ -506,20 +503,13 @@ static int papr_hvpipe_dev_create_handle(u32 srcID) src_info->tsk = current; init_waitqueue_head(&src_info->recv_wqh); - fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); - if (fd < 0) { - err = fd; - goto free_buf; - } - - file = anon_inode_getfile("[papr-hvpipe]", - &papr_hvpipe_handle_ops, (void *)src_info, - O_RDWR); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto free_fd; - } + FD_PREPARE(fdf, O_RDONLY | O_CLOEXEC, + anon_inode_getfile("[papr-hvpipe]", &papr_hvpipe_handle_ops, + (void *)src_info, O_RDWR)); + if (fdf.err) + return fdf.err; + retain_and_null_ptr(src_info); spin_lock(&hvpipe_src_list_lock); /* * If two processes are executing ioctl() for the same @@ -528,22 +518,11 @@ static int papr_hvpipe_dev_create_handle(u32 srcID) */ if (hvpipe_find_source(srcID)) { spin_unlock(&hvpipe_src_list_lock); - err = -EALREADY; - goto free_file; + return -EALREADY; } list_add(&src_info->list, &hvpipe_src_list); spin_unlock(&hvpipe_src_list_lock); - - fd_install(fd, file); - return fd; - -free_file: - fput(file); -free_fd: - put_unused_fd(fd); -free_buf: - kfree(src_info); - return err; + return fd_publish(fdf); } /* diff --git a/arch/powerpc/platforms/pseries/papr-platform-dump.c b/arch/powerpc/platforms/pseries/papr-platform-dump.c index f8d55eccdb6b..be633c9a0e75 100644 --- a/arch/powerpc/platforms/pseries/papr-platform-dump.c +++ b/arch/powerpc/platforms/pseries/papr-platform-dump.c @@ -303,8 +303,6 @@ static long papr_platform_dump_create_handle(u64 dump_tag) { struct ibm_platform_dump_params *params; u64 param_dump_tag; - struct file *file; - long err; int fd; /* @@ -334,34 +332,22 @@ static long papr_platform_dump_create_handle(u64 dump_tag) params->dump_tag_lo = (u32)(dump_tag & 0x00000000ffffffffULL); params->status = RTAS_IBM_PLATFORM_DUMP_START; - fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); + fd = FD_ADD(O_RDONLY | O_CLOEXEC, + anon_inode_getfile_fmode("[papr-platform-dump]", + &papr_platform_dump_handle_ops, + (void *)params, O_RDONLY, + FMODE_LSEEK | FMODE_PREAD)); if (fd < 0) { - err = fd; - goto free_area; - } - - file = anon_inode_getfile_fmode("[papr-platform-dump]", - &papr_platform_dump_handle_ops, - (void *)params, O_RDONLY, - FMODE_LSEEK | FMODE_PREAD); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto put_fd; + rtas_work_area_free(params->work_area); + kfree(params); + return fd; } - fd_install(fd, file); - list_add(¶ms->list, &platform_dump_list); pr_info("%s (%d) initiated platform dump for dump tag %llu\n", current->comm, current->pid, dump_tag); return fd; -put_fd: - put_unused_fd(fd); -free_area: - rtas_work_area_free(params->work_area); - kfree(params); - return err; } /* diff --git a/arch/powerpc/platforms/pseries/papr-rtas-common.c b/arch/powerpc/platforms/pseries/papr-rtas-common.c index 33c606e3378a..1630e0cd210c 100644 --- a/arch/powerpc/platforms/pseries/papr-rtas-common.c +++ b/arch/powerpc/platforms/pseries/papr-rtas-common.c @@ -205,35 +205,18 @@ long papr_rtas_setup_file_interface(struct papr_rtas_sequence *seq, char *name) { const struct papr_rtas_blob *blob; - struct file *file; - long ret; int fd; blob = papr_rtas_retrieve(seq); if (IS_ERR(blob)) return PTR_ERR(blob); - fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); - if (fd < 0) { - ret = fd; - goto free_blob; - } - - file = anon_inode_getfile_fmode(name, fops, (void *)blob, - O_RDONLY, FMODE_LSEEK | FMODE_PREAD); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto put_fd; - } - - fd_install(fd, file); + fd = FD_ADD(O_RDONLY | O_CLOEXEC, + anon_inode_getfile_fmode(name, fops, (void *)blob, O_RDONLY, + FMODE_LSEEK | FMODE_PREAD)); + if (fd < 0) + papr_rtas_blob_free(blob); return fd; - -put_fd: - put_unused_fd(fd); -free_blob: - papr_rtas_blob_free(blob); - return ret; } /* diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 22cda9c452d2..fadec20b87a8 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -367,7 +367,7 @@ config RISCV_NONSTANDARD_CACHE_OPS systems to handle cache management. config AS_HAS_INSN - def_bool $(as-instr,.insn r 51$(comma) 0$(comma) 0$(comma) t0$(comma) t0$(comma) zero) + def_bool $(as-instr,.insn 0x100000f) config AS_HAS_OPTION_ARCH # https://github.com/llvm/llvm-project/commit/9e8ed3403c191ab9c4903e8eeb8f732ff8a43cb4 diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index ecf2fcce2d92..4c6de57f65ef 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -134,21 +134,6 @@ endif CHECKFLAGS += -D__riscv -D__riscv_xlen=$(BITS) # Default target when executing plain make -boot := arch/riscv/boot -ifeq ($(CONFIG_XIP_KERNEL),y) -KBUILD_IMAGE := $(boot)/xipImage -else -ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_CANAAN_K210),yy) -KBUILD_IMAGE := $(boot)/loader.bin -else -ifeq ($(CONFIG_EFI_ZBOOT),) -KBUILD_IMAGE := $(boot)/Image.gz -else -KBUILD_IMAGE := $(boot)/vmlinuz.efi -endif -endif -endif - boot := arch/riscv/boot boot-image-y := Image boot-image-$(CONFIG_KERNEL_BZIP2) := Image.bz2 @@ -159,7 +144,7 @@ boot-image-$(CONFIG_KERNEL_LZO) := Image.lzo boot-image-$(CONFIG_KERNEL_ZSTD) := Image.zst boot-image-$(CONFIG_KERNEL_XZ) := Image.xz ifdef CONFIG_RISCV_M_MODE -boot-image-$(CONFIG_ARCH_CANAAN) := loader.bin +boot-image-$(CONFIG_SOC_CANAAN_K210) := loader.bin endif boot-image-$(CONFIG_EFI_ZBOOT) := vmlinuz.efi boot-image-$(CONFIG_XIP_KERNEL) := xipImage diff --git a/arch/riscv/boot/dts/allwinner/sun20i-d1s.dtsi b/arch/riscv/boot/dts/allwinner/sun20i-d1s.dtsi index 6367112e614a..a7442a508433 100644 --- a/arch/riscv/boot/dts/allwinner/sun20i-d1s.dtsi +++ b/arch/riscv/boot/dts/allwinner/sun20i-d1s.dtsi @@ -28,7 +28,7 @@ riscv,isa-base = "rv64i"; riscv,isa-extensions = "i", "m", "a", "f", "d", "c", "zicntr", "zicsr", "zifencei", "zihpm", "xtheadvector"; - thead,vlenb = <128>; + thead,vlenb = <16>; #cooling-cells = <2>; cpu0_intc: interrupt-controller { diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h index ac28066bb564..e9e8ba83e632 100644 --- a/arch/riscv/include/asm/asm.h +++ b/arch/riscv/include/asm/asm.h @@ -12,6 +12,12 @@ #define __ASM_STR(x) #x #endif +#ifdef CONFIG_AS_HAS_INSN +#define ASM_INSN_I(__x) ".insn " __x +#else +#define ASM_INSN_I(__x) ".4byte " __x +#endif + #if __riscv_xlen == 64 #define __REG_SEL(a, b) __ASM_STR(a) #elif __riscv_xlen == 32 diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index 4c03e20ad11f..6f581b84d8fc 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -60,28 +60,28 @@ typedef u32 bug_insn_t; ".org 2b + " size "\n\t" \ ".popsection" \ -#define __BUG_FLAGS(flags) \ +#define __BUG_FLAGS(cond_str, flags) \ do { \ __asm__ __volatile__ ( \ ARCH_WARN_ASM("%0", "%1", "%2", "%3") \ : \ - : "i" (__FILE__), "i" (__LINE__), \ + : "i" (WARN_CONDITION_STR(cond_str) __FILE__), "i" (__LINE__), \ "i" (flags), \ "i" (sizeof(struct bug_entry))); \ } while (0) #else /* CONFIG_GENERIC_BUG */ -#define __BUG_FLAGS(flags) do { \ +#define __BUG_FLAGS(cond_str, flags) do { \ __asm__ __volatile__ ("ebreak\n"); \ } while (0) #endif /* CONFIG_GENERIC_BUG */ #define BUG() do { \ - __BUG_FLAGS(0); \ + __BUG_FLAGS("", 0); \ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) +#define __WARN_FLAGS(cond_str, flags) __BUG_FLAGS(cond_str, BUGFLAG_WARNING|(flags)) #define ARCH_WARN_REACHABLE diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index c9cfcea52cbb..d29da6ccd3dd 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -256,10 +256,10 @@ INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \ SIMM12((offset) & 0xfe0), RS1(base)) -#define RISCV_PAUSE ".4byte 0x100000f" -#define ZAWRS_WRS_NTO ".4byte 0x00d00073" -#define ZAWRS_WRS_STO ".4byte 0x01d00073" -#define RISCV_NOP4 ".4byte 0x00000013" +#define RISCV_PAUSE ASM_INSN_I("0x100000f") +#define ZAWRS_WRS_NTO ASM_INSN_I("0x00d00073") +#define ZAWRS_WRS_STO ASM_INSN_I("0x01d00073") +#define RISCV_NOP4 ASM_INSN_I("0x00000013") #define RISCV_INSN_NOP4 _AC(0x00000013, U) diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index f5f4f7f85543..36bba6720c26 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -437,10 +437,10 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) __clear_user(untagged_addr(to), n) : n; } -#define __get_kernel_nofault(dst, src, type, err_label) \ +#define arch_get_kernel_nofault(dst, src, type, err_label) \ __get_user_nocheck(*((type *)(dst)), (__force __user type *)(src), err_label) -#define __put_kernel_nofault(dst, src, type, err_label) \ +#define arch_put_kernel_nofault(dst, src, type, err_label) \ __put_user_nocheck(*((type *)(src)), (__force __user type *)(dst), err_label) static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) @@ -460,10 +460,10 @@ static inline void user_access_restore(unsigned long enabled) { } * We want the unsafe accessors to always be inlined and use * the error labels - thus the macro games. */ -#define unsafe_put_user(x, ptr, label) \ +#define arch_unsafe_put_user(x, ptr, label) \ __put_user_nocheck(x, (ptr), label) -#define unsafe_get_user(x, ptr, label) do { \ +#define arch_unsafe_get_user(x, ptr, label) do { \ __inttype(*(ptr)) __gu_val; \ __get_user_nocheck(__gu_val, (ptr), label); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ diff --git a/arch/riscv/include/asm/vendor_extensions/mips.h b/arch/riscv/include/asm/vendor_extensions/mips.h index ea8ca747d691..ffeb12dc17a3 100644 --- a/arch/riscv/include/asm/vendor_extensions/mips.h +++ b/arch/riscv/include/asm/vendor_extensions/mips.h @@ -30,8 +30,8 @@ extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_mips; * allowing any subsequent instructions to fetch. */ -#define MIPS_PAUSE ".4byte 0x00501013\n\t" -#define MIPS_EHB ".4byte 0x00301013\n\t" -#define MIPS_IHB ".4byte 0x00101013\n\t" +#define MIPS_PAUSE ASM_INSN_I("0x00501013\n\t") +#define MIPS_EHB ASM_INSN_I("0x00301013\n\t") +#define MIPS_IHB ASM_INSN_I("0x00101013\n\t") #endif // _ASM_RISCV_VENDOR_EXTENSIONS_MIPS_H diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h index 3b09874d7a6d..7f5030ee1fcf 100644 --- a/arch/riscv/include/asm/vendorid_list.h +++ b/arch/riscv/include/asm/vendorid_list.h @@ -7,8 +7,8 @@ #define ANDES_VENDOR_ID 0x31e #define MICROCHIP_VENDOR_ID 0x029 +#define MIPS_VENDOR_ID 0x127 #define SIFIVE_VENDOR_ID 0x489 #define THEAD_VENDOR_ID 0x5b7 -#define MIPS_VENDOR_ID 0x722 #endif diff --git a/arch/riscv/kernel/kgdb.c b/arch/riscv/kernel/kgdb.c index 9f3db3503dab..15fec5d1e6de 100644 --- a/arch/riscv/kernel/kgdb.c +++ b/arch/riscv/kernel/kgdb.c @@ -265,10 +265,10 @@ void kgdb_arch_handle_qxfer_pkt(char *remcom_in_buffer, { if (!strncmp(remcom_in_buffer, gdb_xfer_read_target, sizeof(gdb_xfer_read_target))) - strcpy(remcom_out_buffer, riscv_gdb_stub_target_desc); + strscpy(remcom_out_buffer, riscv_gdb_stub_target_desc, BUFMAX); else if (!strncmp(remcom_in_buffer, gdb_xfer_read_cpuxml, sizeof(gdb_xfer_read_cpuxml))) - strcpy(remcom_out_buffer, riscv_gdb_stub_cpuxml); + strscpy(remcom_out_buffer, riscv_gdb_stub_cpuxml, BUFMAX); } static inline void kgdb_arch_update_addr(struct pt_regs *regs, diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c index 75551ac6504c..1675cbad8619 100644 --- a/arch/riscv/kernel/module-sections.c +++ b/arch/riscv/kernel/module-sections.c @@ -119,6 +119,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, unsigned int num_plts = 0; unsigned int num_gots = 0; Elf_Rela *scratch = NULL; + Elf_Rela *new_scratch; size_t scratch_size = 0; int i; @@ -168,9 +169,12 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch); if (scratch_size_needed > scratch_size) { scratch_size = scratch_size_needed; - scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL); - if (!scratch) + new_scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL); + if (!new_scratch) { + kvfree(scratch); return -ENOMEM; + } + scratch = new_scratch; } for (size_t j = 0; j < num_relas; j++) diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 5e8cde055264..c443337056ab 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -648,9 +648,9 @@ int sbi_debug_console_read(char *bytes, unsigned int num_bytes) void __init sbi_init(void) { + bool srst_power_off = false; int ret; - sbi_set_power_off(); ret = sbi_get_spec_version(); if (ret > 0) sbi_spec_version = ret; @@ -683,6 +683,7 @@ void __init sbi_init(void) sbi_probe_extension(SBI_EXT_SRST)) { pr_info("SBI SRST extension detected\n"); register_platform_power_off(sbi_srst_power_off); + srst_power_off = true; sbi_srst_reboot_nb.notifier_call = sbi_srst_reboot; sbi_srst_reboot_nb.priority = 192; register_restart_handler(&sbi_srst_reboot_nb); @@ -702,4 +703,7 @@ void __init sbi_init(void) __sbi_send_ipi = __sbi_send_ipi_v01; __sbi_rfence = __sbi_rfence_v01; } + + if (!srst_power_off) + sbi_set_power_off(); } diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 3fe9e6edef8f..b41b6255751c 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -16,6 +16,22 @@ #ifdef CONFIG_FRAME_POINTER +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + unsigned long addr = x; \ + if ((task) == current) \ + val = READ_ONCE(addr); \ + else \ + val = READ_ONCE_NOCHECK(addr); \ + val; \ +}) + extern asmlinkage void handle_exception(void); extern unsigned long ret_from_exception_end; @@ -69,8 +85,9 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, fp = frame->ra; pc = regs->ra; } else { - fp = frame->fp; - pc = ftrace_graph_ret_addr(current, &graph_idx, frame->ra, + fp = READ_ONCE_TASK_STACK(task, frame->fp); + pc = READ_ONCE_TASK_STACK(task, frame->ra); + pc = ftrace_graph_ret_addr(current, &graph_idx, pc, &frame->ra); if (pc >= (unsigned long)handle_exception && pc < (unsigned long)&ret_from_exception_end) { diff --git a/arch/riscv/kernel/tests/Kconfig.debug b/arch/riscv/kernel/tests/Kconfig.debug index 5db4df44279e..40f8dafffa0a 100644 --- a/arch/riscv/kernel/tests/Kconfig.debug +++ b/arch/riscv/kernel/tests/Kconfig.debug @@ -31,7 +31,7 @@ config RISCV_MODULE_LINKING_KUNIT If unsure, say N. config RISCV_KPROBES_KUNIT - bool "KUnit test for riscv kprobes" if !KUNIT_ALL_TESTS + tristate "KUnit test for riscv kprobes" if !KUNIT_ALL_TESTS depends on KUNIT depends on KPROBES default KUNIT_ALL_TESTS diff --git a/arch/riscv/kernel/tests/kprobes/Makefile b/arch/riscv/kernel/tests/kprobes/Makefile index 4cb6c66a98e8..df7256f62313 100644 --- a/arch/riscv/kernel/tests/kprobes/Makefile +++ b/arch/riscv/kernel/tests/kprobes/Makefile @@ -1 +1,3 @@ -obj-y += test-kprobes.o test-kprobes-asm.o +obj-$(CONFIG_RISCV_KPROBES_KUNIT) += kprobes_riscv_kunit.o + +kprobes_riscv_kunit-objs := test-kprobes.o test-kprobes-asm.o diff --git a/arch/riscv/kernel/tests/kprobes/test-kprobes.c b/arch/riscv/kernel/tests/kprobes/test-kprobes.c index 6f6cdfbf5a95..664535ca0a98 100644 --- a/arch/riscv/kernel/tests/kprobes/test-kprobes.c +++ b/arch/riscv/kernel/tests/kprobes/test-kprobes.c @@ -49,8 +49,11 @@ static struct kunit_case kprobes_testcases[] = { }; static struct kunit_suite kprobes_test_suite = { - .name = "kprobes_test_riscv", + .name = "kprobes_riscv", .test_cases = kprobes_testcases, }; kunit_test_suites(&kprobes_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("KUnit test for riscv kprobes"); diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index fda0346f0ea1..11422cb95a64 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -689,8 +689,20 @@ bool kvm_riscv_vcpu_aia_imsic_has_interrupt(struct kvm_vcpu *vcpu) */ read_lock_irqsave(&imsic->vsfile_lock, flags); - if (imsic->vsfile_cpu > -1) - ret = !!(csr_read(CSR_HGEIP) & BIT(imsic->vsfile_hgei)); + if (imsic->vsfile_cpu > -1) { + /* + * This function is typically called from kvm_vcpu_block() via + * kvm_arch_vcpu_runnable() upon WFI trap. The kvm_vcpu_block() + * can be preempted and the blocking VCPU might resume on a + * different CPU. This means it is possible that current CPU + * does not match the imsic->vsfile_cpu hence this function + * must check imsic->vsfile_cpu before accessing HGEIP CSR. + */ + if (imsic->vsfile_cpu != vcpu->cpu) + ret = true; + else + ret = !!(csr_read(CSR_HGEIP) & BIT(imsic->vsfile_hgei)); + } read_unlock_irqrestore(&imsic->vsfile_lock, flags); return ret; diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 525fb5a330c0..58f5f3536ffd 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -171,7 +171,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, enum kvm_mr_change change) { hva_t hva, reg_end, size; - gpa_t base_gpa; bool writable; int ret = 0; @@ -190,15 +189,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, hva = new->userspace_addr; size = new->npages << PAGE_SHIFT; reg_end = hva + size; - base_gpa = new->base_gfn << PAGE_SHIFT; writable = !(new->flags & KVM_MEM_READONLY); mmap_read_lock(current->mm); /* * A memory region could potentially cover multiple VMAs, and - * any holes between them, so iterate over all of them to find - * out if we can map any of them right now. + * any holes between them, so iterate over all of them. * * +--------------------------------------------+ * +---------------+----------------+ +----------------+ @@ -209,7 +206,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, */ do { struct vm_area_struct *vma; - hva_t vm_start, vm_end; + hva_t vm_end; vma = find_vma_intersection(current->mm, hva, reg_end); if (!vma) @@ -225,36 +222,18 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, } /* Take the intersection of this VMA with the memory region */ - vm_start = max(hva, vma->vm_start); vm_end = min(reg_end, vma->vm_end); if (vma->vm_flags & VM_PFNMAP) { - gpa_t gpa = base_gpa + (vm_start - hva); - phys_addr_t pa; - - pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; - pa += vm_start - vma->vm_start; - /* IO region dirty page logging not allowed */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { ret = -EINVAL; goto out; } - - ret = kvm_riscv_mmu_ioremap(kvm, gpa, pa, vm_end - vm_start, - writable, false); - if (ret) - break; } hva = vm_end; } while (hva < reg_end); - if (change == KVM_MR_FLAGS_ONLY) - goto out; - - if (ret) - kvm_riscv_mmu_iounmap(kvm, base_gpa, size); - out: mmap_read_unlock(current->mm); return ret; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index bccb919ca615..5ce35aba6069 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -212,7 +212,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return (kvm_riscv_vcpu_has_interrupts(vcpu, -1UL) && + return (kvm_riscv_vcpu_has_interrupts(vcpu, -1ULL) && !kvm_riscv_vcpu_stopped(vcpu) && !vcpu->arch.pause); } diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 3b51690cc876..34299c2b231f 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -21,7 +21,7 @@ #define pt_dump_seq_puts(m, fmt) \ ({ \ if (m) \ - seq_printf(m, fmt); \ + seq_puts(m, fmt); \ }) /* diff --git a/arch/s390/Makefile b/arch/s390/Makefile index b4769241332b..8578361133a4 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -22,7 +22,7 @@ KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ ifndef CONFIG_AS_IS_LLVM KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) endif -KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11 +KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11 -fms-extensions KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY KBUILD_CFLAGS_DECOMPRESSOR += -D__DECOMPRESSOR KBUILD_CFLAGS_DECOMPRESSOR += -Wno-pointer-sign @@ -35,6 +35,7 @@ KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-membe KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,)) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds) +KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag) UTS_MACHINE := s390x STACK_SIZE := $(if $(CONFIG_KASAN),65536,$(if $(CONFIG_KMSAN),65536,16384)) diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index c500d45fb465..acb4b13d98c5 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -2,69 +2,55 @@ #ifndef _ASM_S390_BUG_H #define _ASM_S390_BUG_H -#include <linux/compiler.h> - -#ifdef CONFIG_BUG - -#ifdef CONFIG_DEBUG_BUGVERBOSE - -#define __EMIT_BUG(x) do { \ - asm_inline volatile( \ - "0: mc 0,0\n" \ - ".section .rodata.str,\"aMS\",@progbits,1\n" \ - "1: .asciz \""__FILE__"\"\n" \ - ".previous\n" \ - ".section __bug_table,\"aw\"\n" \ - "2: .long 0b-.\n" \ - " .long 1b-.\n" \ - " .short %0,%1\n" \ - " .org 2b+%2\n" \ - ".previous\n" \ - : : "i" (__LINE__), \ - "i" (x), \ - "i" (sizeof(struct bug_entry))); \ -} while (0) - -#else /* CONFIG_DEBUG_BUGVERBOSE */ - -#define __EMIT_BUG(x) do { \ - asm_inline volatile( \ - "0: mc 0,0\n" \ - ".section __bug_table,\"aw\"\n" \ - "1: .long 0b-.\n" \ - " .short %0\n" \ - " .org 1b+%1\n" \ - ".previous\n" \ - : : "i" (x), \ - "i" (sizeof(struct bug_entry))); \ +#include <linux/stringify.h> + +#ifndef CONFIG_DEBUG_BUGVERBOSE +#define _BUGVERBOSE_LOCATION(file, line) +#else +#define __BUGVERBOSE_LOCATION(file, line) \ + .pushsection .rodata.str, "aMS", @progbits, 1; \ + 10002: .ascii file "\0"; \ + .popsection; \ + \ + .long 10002b - .; \ + .short line; +#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) +#endif + +#ifndef CONFIG_GENERIC_BUG +#define __BUG_ENTRY(cond_str, flags) +#else +#define __BUG_ENTRY(cond_str, flags) \ + .pushsection __bug_table, "aw"; \ + .align 4; \ + 10000: .long 10001f - .; \ + _BUGVERBOSE_LOCATION(WARN_CONDITION_STR(cond_str) __FILE__, __LINE__) \ + .short flags; \ + .popsection; \ + 10001: +#endif + +#define ASM_BUG_FLAGS(cond_str, flags) \ + __BUG_ENTRY(cond_str, flags) \ + mc 0,0 + +#define ASM_BUG() ASM_BUG_FLAGS("", 0) + +#define __BUG_FLAGS(cond_str, flags) \ + asm_inline volatile(__stringify(ASM_BUG_FLAGS(cond_str, flags))); + +#define __WARN_FLAGS(cond_str, flags) \ +do { \ + __BUG_FLAGS(cond_str, BUGFLAG_WARNING|(flags)); \ } while (0) -#endif /* CONFIG_DEBUG_BUGVERBOSE */ - -#define BUG() do { \ - __EMIT_BUG(0); \ - unreachable(); \ +#define BUG() \ +do { \ + __BUG_FLAGS("", 0); \ + unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) do { \ - __EMIT_BUG(BUGFLAG_WARNING|(flags)); \ -} while (0) - -#define WARN_ON(x) ({ \ - int __ret_warn_on = !!(x); \ - if (__builtin_constant_p(__ret_warn_on)) { \ - if (__ret_warn_on) \ - __WARN(); \ - } else { \ - if (unlikely(__ret_warn_on)) \ - __WARN(); \ - } \ - unlikely(__ret_warn_on); \ -}) - #define HAVE_ARCH_BUG -#define HAVE_ARCH_WARN_ON -#endif /* CONFIG_BUG */ #include <asm-generic/bug.h> diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index 6ce6b56e282b..46f92bb4c9e5 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -19,7 +19,7 @@ #ifdef CONFIG_EXPOLINE_EXTERN SYM_CODE_START(\name) #else - .pushsection .text.\name,"axG",@progbits,\name,comdat + .pushsection .text..\name,"axG",@progbits,\name,comdat .globl \name .hidden \name .type \name,@function diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b7100c6a4054..6663f1619abb 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1154,17 +1154,15 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_NODAT 0x400 #define IPTE_GUEST_ASCE 0x800 -static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, - unsigned long opt, unsigned long asce, - int local) +static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, int local) { unsigned long pto; pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1); - asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]" + asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%%r0,%[m4]" : "+m" (*ptep) - : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt), - [asce] "a" (asce), [m4] "i" (local)); + : [r1] "a" (pto), [r2] "a" (addr & PAGE_MASK), + [m4] "i" (local)); } static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep, @@ -1348,7 +1346,7 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, * A local RDP can be used to do the flush. */ if (cpu_has_rdp() && !(pte_val(*ptep) & _PAGE_PROTECT)) - __ptep_rdp(address, ptep, 0, 0, 1); + __ptep_rdp(address, ptep, 1); } #define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 3e5b8b677057..c5e02addcd67 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -468,8 +468,8 @@ do { \ #endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT && CONFIG_CC_HAS_ASM_AOR_FORMAT_FLAGS */ -#define __get_kernel_nofault __mvc_kernel_nofault -#define __put_kernel_nofault __mvc_kernel_nofault +#define arch_get_kernel_nofault __mvc_kernel_nofault +#define arch_put_kernel_nofault __mvc_kernel_nofault void __cmpxchg_user_key_called_with_bad_pointer(void); diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 8a6744d658db..5863787ab036 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -472,3 +472,4 @@ 467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr sys_file_setattr +470 common listns sys_listns sys_listns diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index d74d4c52ccd0..8609126961dc 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -51,7 +51,7 @@ SECTIONS IRQENTRY_TEXT SOFTIRQENTRY_TEXT FTRACE_HOTPATCH_TRAMPOLINES_TEXT - *(.text.*_indirect_*) + *(.text..*_indirect_*) *(.gnu.warning) . = ALIGN(PAGE_SIZE); _etext = .; /* End of text section */ diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c index e6175d75e4b0..2f829448c719 100644 --- a/arch/s390/mm/pfault.c +++ b/arch/s390/mm/pfault.c @@ -199,8 +199,7 @@ block: * return to userspace schedule() to block. */ __set_current_state(TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - set_preempt_need_resched(); + set_need_resched_current(); } } out: diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 0fde20bbc50b..05974304d622 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -274,9 +274,9 @@ void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, preempt_disable(); atomic_inc(&mm->context.flush_count); if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __ptep_rdp(addr, ptep, 0, 0, 1); + __ptep_rdp(addr, ptep, 1); else - __ptep_rdp(addr, ptep, 0, 0, 0); + __ptep_rdp(addr, ptep, 0); /* * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That * means it is still valid and active, and must not be changed according diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index bd39b36e7bd6..0c196a5b194a 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -13,7 +13,7 @@ CFLAGS_sha256.o := -D__NO_FORTIFY $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(call if_changed_rule,as_o_S) -KBUILD_CFLAGS := -std=gnu11 -fno-strict-aliasing -Wall -Wstrict-prototypes +KBUILD_CFLAGS := -std=gnu11 -fms-extensions -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common @@ -21,6 +21,7 @@ KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(CLANG_FLAGS) +KBUILD_CFLAGS += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) KBUILD_AFLAGS += -D__DISABLE_EXPORTS diff --git a/arch/sh/include/asm/bug.h b/arch/sh/include/asm/bug.h index 05a485c4fabc..891276687355 100644 --- a/arch/sh/include/asm/bug.h +++ b/arch/sh/include/asm/bug.h @@ -52,14 +52,14 @@ do { \ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) \ +#define __WARN_FLAGS(cond_str, flags) \ do { \ __asm__ __volatile__ ( \ "1:\t.short %O0\n" \ _EMIT_BUG_ENTRY \ : \ : "n" (TRAPA_BUG_OPCODE), \ - "i" (__FILE__), \ + "i" (WARN_CONDITION_STR(cond_str) __FILE__), \ "i" (__LINE__), \ "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry))); \ diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 5e9c9eff5539..969c11325ade 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -473,3 +473,4 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index ebb7d06d1044..39aa26b6a50b 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -515,3 +515,4 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fa3b616af03a..34fb46d5341b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -261,6 +261,7 @@ config X86 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_KRETPROBES select HAVE_RETHOOK + select HAVE_KLP_BUILD if X86_64 select HAVE_LIVEPATCH if X86_64 select HAVE_MIXED_BREAKPOINTS_REGS select HAVE_MOD_ARCH_SPECIFIC @@ -297,6 +298,7 @@ config X86 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL select HAVE_UNSTABLE_SCHED_CLOCK + select HAVE_UNWIND_USER_FP if X86_64 select HAVE_USER_RETURN_NOTIFIER select HAVE_GENERIC_VDSO select VDSO_GETRANDOM if X86_64 @@ -379,7 +381,7 @@ config GENERIC_CSUM config GENERIC_BUG def_bool y depends on BUG - select GENERIC_BUG_RELATIVE_POINTERS if X86_64 + select GENERIC_BUG_RELATIVE_POINTERS config GENERIC_BUG_RELATIVE_POINTERS bool diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8fbff3106c56..1d403a3612ea 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -48,7 +48,8 @@ endif # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. -REALMODE_CFLAGS := -std=gnu11 -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \ +REALMODE_CFLAGS := -std=gnu11 -fms-extensions -m16 -g -Os \ + -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) @@ -60,6 +61,7 @@ REALMODE_CFLAGS += $(cc_stack_align4) REALMODE_CFLAGS += $(CLANG_FLAGS) ifdef CONFIG_CC_IS_CLANG REALMODE_CFLAGS += -Wno-gnu +REALMODE_CFLAGS += -Wno-microsoft-anon-tag endif export REALMODE_CFLAGS @@ -98,7 +100,7 @@ ifeq ($(CONFIG_X86_KERNEL_IBT),y) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104816 # KBUILD_CFLAGS += $(call cc-option,-fcf-protection=branch -fno-jump-tables) -KBUILD_RUSTFLAGS += -Zcf-protection=branch -Zno-jump-tables +KBUILD_RUSTFLAGS += -Zcf-protection=branch $(if $(call rustc-min-version,109300),-Cjump-tables=n,-Zno-jump-tables) else KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none) endif diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index a2b6b428922a..bda042933a05 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -135,29 +135,29 @@ int enable_a20(void) (legacy free, etc.) */ if (a20_test_short()) return 0; - + /* Next, try the BIOS (INT 0x15, AX=0x2401) */ enable_a20_bios(); if (a20_test_short()) return 0; - + /* Try enabling A20 through the keyboard controller */ kbc_err = empty_8042(); if (a20_test_short()) return 0; /* BIOS worked, but with delayed reaction */ - + if (!kbc_err) { enable_a20_kbc(); if (a20_test_long()) return 0; } - + /* Finally, try enabling the "fast A20 gate" */ enable_a20_fast(); if (a20_test_long()) return 0; } - + return -1; } diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index a3c58ebe3662..8e3eab34dff4 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -193,8 +193,6 @@ static inline bool heap_free(size_t n) void copy_to_fs(addr_t dst, void *src, size_t len); void *copy_from_fs(void *dst, addr_t src, size_t len); -void copy_to_gs(addr_t dst, void *src, size_t len); -void *copy_from_gs(void *dst, addr_t src, size_t len); /* a20.c */ int enable_a20(void); diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 74657589264d..68f9d7a1683b 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -25,7 +25,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ # avoid errors with '-march=i386', and future flags may depend on the target to # be valid. KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS) -KBUILD_CFLAGS += -std=gnu11 +KBUILD_CFLAGS += -std=gnu11 -fms-extensions KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING @@ -36,7 +36,10 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += -ffreestanding -fshort-wchar KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) -KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +ifdef CONFIG_CC_IS_CLANG +KBUILD_CFLAGS += -Wno-gnu +KBUILD_CFLAGS += -Wno-microsoft-anon-tag +endif KBUILD_CFLAGS += -Wno-pointer-sign KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += -D__DISABLE_EXPORTS diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index db1048621ea2..fd855e32c9b9 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -152,17 +152,6 @@ bool insn_has_rep_prefix(struct insn *insn); void sev_insn_decode_init(void); bool early_setup_ghcb(void); #else -static inline void sev_enable(struct boot_params *bp) -{ - /* - * bp->cc_blob_address should only be set by boot/compressed kernel. - * Initialize it to 0 unconditionally (thus here in this stub too) to - * ensure that uninitialized values from buggy bootloaders aren't - * propagated. - */ - if (bp) - bp->cc_blob_address = 0; -} static inline void snp_check_features(void) { } static inline void sev_es_shutdown_ghcb(void) { } static inline bool sev_es_check_ghcb_fault(unsigned long address) diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index bdd26050dff7..0e89e197e112 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -3,6 +3,7 @@ #include <asm/bootparam.h> #include <asm/bootparam_utils.h> #include <asm/e820/types.h> +#include <asm/pgtable.h> #include <asm/processor.h> #include "../string.h" #include "efi.h" @@ -168,9 +169,10 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) * For 4- to 5-level paging transition, set up current CR3 as * the first and the only entry in a new top-level page table. */ - *trampoline_32bit = __native_read_cr3() | _PAGE_TABLE_NOENC; + *trampoline_32bit = native_read_cr3_pa() | _PAGE_TABLE_NOENC; } else { - unsigned long src; + u64 *new_cr3; + pgd_t *pgdp; /* * For 5- to 4-level paging transition, copy page table pointed @@ -180,8 +182,9 @@ asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) * We cannot just point to the page table from trampoline as it * may be above 4G. */ - src = *(unsigned long *)__native_read_cr3() & PAGE_MASK; - memcpy(trampoline_32bit, (void *)src, PAGE_SIZE); + pgdp = (pgd_t *)native_read_cr3_pa(); + new_cr3 = (u64 *)(native_pgd_val(pgdp[0]) & PTE_PFN_MASK); + memcpy(trampoline_32bit, new_cr3, PAGE_SIZE); } toggle_la57(trampoline_32bit); diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c index 7530ad8b768b..030001b46554 100644 --- a/arch/x86/boot/compressed/sev-handle-vc.c +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -29,11 +29,10 @@ bool insn_has_rep_prefix(struct insn *insn) { insn_byte_t p; - int i; insn_get_prefixes(insn); - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { if (p == 0xf2 || p == 0xf3) return true; } diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 6e5c32a53d03..c8c1464b3a56 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -14,6 +14,7 @@ #include <asm/bootparam.h> #include <asm/pgtable_types.h> +#include <asm/shared/msr.h> #include <asm/sev.h> #include <asm/trapnr.h> #include <asm/trap_pf.h> @@ -397,7 +398,7 @@ void sev_enable(struct boot_params *bp) } /* Set the SME mask if this is an SEV guest. */ - boot_rdmsr(MSR_AMD64_SEV, &m); + raw_rdmsr(MSR_AMD64_SEV, &m); sev_status = m.q; if (!(sev_status & MSR_AMD64_SEV_ENABLED)) return; @@ -446,7 +447,7 @@ u64 sev_get_status(void) if (sev_check_cpu_support() < 0) return 0; - boot_rdmsr(MSR_AMD64_SEV, &m); + raw_rdmsr(MSR_AMD64_SEV, &m); return m.q; } @@ -496,7 +497,7 @@ bool early_is_sevsnp_guest(void) struct msr m; /* Obtain the address of the calling area to use */ - boot_rdmsr(MSR_SVSM_CAA, &m); + raw_rdmsr(MSR_SVSM_CAA, &m); boot_svsm_caa_pa = m.q; /* diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h index 92f79c21939c..22637b416b46 100644 --- a/arch/x86/boot/compressed/sev.h +++ b/arch/x86/boot/compressed/sev.h @@ -10,7 +10,7 @@ #ifdef CONFIG_AMD_MEM_ENCRYPT -#include "../msr.h" +#include <asm/shared/msr.h> void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 sev_get_status(void); @@ -20,7 +20,7 @@ static inline u64 sev_es_rd_ghcb_msr(void) { struct msr m; - boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); + raw_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); return m.q; } @@ -30,7 +30,7 @@ static inline void sev_es_wr_ghcb_msr(u64 val) struct msr m; m.q = val; - boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); + raw_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); } #else diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index f82de8de5dc6..2e1bb936cba2 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -26,9 +26,9 @@ #include <asm/intel-family.h> #include <asm/processor-flags.h> #include <asm/msr-index.h> +#include <asm/shared/msr.h> #include "string.h" -#include "msr.h" static u32 err_flags[NCAPINTS]; @@ -134,9 +134,9 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) struct msr m; - boot_rdmsr(MSR_K7_HWCR, &m); + raw_rdmsr(MSR_K7_HWCR, &m); m.l &= ~(1 << 15); - boot_wrmsr(MSR_K7_HWCR, &m); + raw_wrmsr(MSR_K7_HWCR, &m); get_cpuflags(); /* Make sure it really did something */ err = check_cpuflags(); @@ -148,9 +148,9 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) struct msr m; - boot_rdmsr(MSR_VIA_FCR, &m); + raw_rdmsr(MSR_VIA_FCR, &m); m.l |= (1 << 1) | (1 << 7); - boot_wrmsr(MSR_VIA_FCR, &m); + raw_wrmsr(MSR_VIA_FCR, &m); set_bit(X86_FEATURE_CX8, cpu.flags); err = check_cpuflags(); @@ -160,14 +160,14 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) struct msr m, m_tmp; u32 level = 1; - boot_rdmsr(0x80860004, &m); + raw_rdmsr(0x80860004, &m); m_tmp = m; m_tmp.l = ~0; - boot_wrmsr(0x80860004, &m_tmp); + raw_wrmsr(0x80860004, &m_tmp); asm("cpuid" : "+a" (level), "=d" (cpu.flags[0]) : : "ecx", "ebx"); - boot_wrmsr(0x80860004, &m); + raw_wrmsr(0x80860004, &m); err = check_cpuflags(); } else if (err == 0x01 && diff --git a/arch/x86/boot/msr.h b/arch/x86/boot/msr.h deleted file mode 100644 index aed66f7ae199..000000000000 --- a/arch/x86/boot/msr.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Helpers/definitions related to MSR access. - */ - -#ifndef BOOT_MSR_H -#define BOOT_MSR_H - -#include <asm/shared/msr.h> - -/* - * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the - * boot kernel since they rely on tracepoint/exception handling infrastructure - * that's not available here. - */ -static inline void boot_rdmsr(unsigned int reg, struct msr *m) -{ - asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg)); -} - -static inline void boot_wrmsr(unsigned int reg, const struct msr *m) -{ - asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory"); -} - -#endif /* BOOT_MSR_H */ diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile index e8fdf020b422..5e499cfb29b5 100644 --- a/arch/x86/boot/startup/Makefile +++ b/arch/x86/boot/startup/Makefile @@ -36,7 +36,7 @@ $(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y # relocations, even if other objtool actions are being deferred. # $(pi-objs): objtool-enabled = 1 -$(pi-objs): objtool-args = $(if $(delay-objtool),,$(objtool-args-y)) --noabs +$(pi-objs): objtool-args = $(if $(delay-objtool),--dry-run,$(objtool-args-y)) --noabs # # Confine the startup code by prefixing all symbols with __pi_ (for position diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 4e22ffd73516..a0fa8bb2b945 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -12,7 +12,7 @@ #include <asm/setup_data.h> #ifndef __BOOT_COMPRESSED -#define has_cpuflag(f) boot_cpu_has(f) +#define has_cpuflag(f) cpu_feature_enabled(f) #else #undef WARN #define WARN(condition, format...) (!!(condition)) diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index 7fc136a35334..f08c7505ed82 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -352,7 +352,6 @@ fault: #define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) #define error(v) -#define has_cpuflag(f) boot_cpu_has(f) #include "vc-shared.c" diff --git a/arch/x86/coco/sev/vc-shared.c b/arch/x86/coco/sev/vc-shared.c index 9b01c9ad81be..58b2f985d546 100644 --- a/arch/x86/coco/sev/vc-shared.c +++ b/arch/x86/coco/sev/vc-shared.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 +#ifndef __BOOT_COMPRESSED +#define has_cpuflag(f) cpu_feature_enabled(f) +#endif + static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, unsigned long exit_code) { @@ -546,6 +550,13 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb, /* xgetbv will cause #GP - use reset value for xcr0 */ ghcb_set_xcr0(ghcb, 1); + if (has_cpuflag(X86_FEATURE_SHSTK) && regs->ax == 0xd && regs->cx == 1) { + struct msr m; + + raw_rdmsr(MSR_IA32_XSS, &m); + ghcb_set_xss(ghcb, m.q); + } + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); if (ret != ES_OK) return ret; diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 1d723c5ae9dd..6ba2b3adcef0 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -32,6 +32,14 @@ SYM_FUNC_START(write_ibpb) SYM_FUNC_END(write_ibpb) EXPORT_SYMBOL_FOR_KVM(write_ibpb); +SYM_FUNC_START(__WARN_trap) + ANNOTATE_NOENDBR + ANNOTATE_REACHABLE + ud1 (%edx), %_ASM_ARG1 + RET +SYM_FUNC_END(__WARN_trap) +EXPORT_SYMBOL(__WARN_trap) + .popsection /* diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 2b15ea17bb7c..a67a644d0cfe 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -274,9 +274,10 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) * fetch EBP before invoking any of the syscall entry work * functions. */ - syscall_enter_from_user_mode_prepare(regs); + enter_from_user_mode(regs); instrumentation_begin(); + local_irq_enable(); /* Fetch EBP from where the vDSO stashed it. */ if (IS_ENABLED(CONFIG_X86_64)) { /* diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4877e16da69a..e979a3eac7a3 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -475,3 +475,4 @@ 467 i386 open_tree_attr sys_open_tree_attr 468 i386 file_getattr sys_file_getattr 469 i386 file_setattr sys_file_setattr +470 i386 listns sys_listns diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index ced2a1deecd7..8a4ac4841be6 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -394,6 +394,7 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 2dd9afb8dd9d..44656d2fb555 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -764,7 +764,12 @@ static void amd_pmu_enable_all(int added) if (!test_bit(idx, cpuc->active_mask)) continue; - amd_pmu_enable_event(cpuc->events[idx]); + /* + * FIXME: cpuc->events[idx] can become NULL in a subtle race + * condition with NMI->throttle->x86_pmu_stop(). + */ + if (cpuc->events[idx]) + amd_pmu_enable_event(cpuc->events[idx]); } } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index b5e397fa0835..0c38a31d5fc7 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -555,14 +555,22 @@ static inline int precise_br_compat(struct perf_event *event) return m == b; } -int x86_pmu_max_precise(void) +int x86_pmu_max_precise(struct pmu *pmu) { int precise = 0; - /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { - precise++; + /* arch PEBS */ + if (x86_pmu.arch_pebs) { + precise = 2; + if (hybrid(pmu, arch_pebs_cap).pdists) + precise++; + + return precise; + } + /* legacy PEBS - support for constant skid */ + precise++; /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; @@ -570,13 +578,14 @@ int x86_pmu_max_precise(void) if (x86_pmu.pebs_prec_dist) precise++; } + return precise; } int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { - int precise = x86_pmu_max_precise(); + int precise = x86_pmu_max_precise(event->pmu); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -1345,6 +1354,7 @@ static void x86_pmu_enable(struct pmu *pmu) hwc->state |= PERF_HES_ARCH; x86_pmu_stop(event, PERF_EF_UPDATE); + cpuc->events[hwc->idx] = NULL; } /* @@ -1366,6 +1376,7 @@ static void x86_pmu_enable(struct pmu *pmu) * if cpuc->enabled = 0, then no wrmsr as * per x86_pmu_enable_event() */ + cpuc->events[hwc->idx] = event; x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1532,7 +1543,6 @@ static void x86_pmu_start(struct perf_event *event, int flags) event->hw.state = 0; - cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); @@ -1611,7 +1621,6 @@ void x86_pmu_stop(struct perf_event *event, int flags) if (test_bit(hwc->idx, cpuc->active_mask)) { static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); - cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } @@ -1649,6 +1658,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) * Not a TXN, therefore cleanup properly. */ x86_pmu_stop(event, PERF_EF_UPDATE); + cpuc->events[event->hw.idx] = NULL; for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) @@ -2630,7 +2640,9 @@ static ssize_t max_precise_show(struct device *cdev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); + struct pmu *pmu = dev_get_drvdata(cdev); + + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu)); } static DEVICE_ATTR_RO(max_precise); @@ -2790,13 +2802,13 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re return; } - if (perf_callchain_store(entry, regs->ip)) - return; - - if (perf_hw_regs(regs)) + if (perf_hw_regs(regs)) { + if (perf_callchain_store(entry, regs->ip)) + return; unwind_start(&state, current, regs, NULL); - else + } else { unwind_start(&state, current, NULL, (void *)regs->sp); + } for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); @@ -2846,46 +2858,6 @@ static unsigned long get_segment_base(unsigned int segment) return get_desc_base(desc); } -#ifdef CONFIG_UPROBES -/* - * Heuristic-based check if uprobe is installed at the function entry. - * - * Under assumption of user code being compiled with frame pointers, - * `push %rbp/%ebp` is a good indicator that we indeed are. - * - * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. - * If we get this wrong, captured stack trace might have one extra bogus - * entry, but the rest of stack trace will still be meaningful. - */ -static bool is_uprobe_at_func_entry(struct pt_regs *regs) -{ - struct arch_uprobe *auprobe; - - if (!current->utask) - return false; - - auprobe = current->utask->auprobe; - if (!auprobe) - return false; - - /* push %rbp/%ebp */ - if (auprobe->insn[0] == 0x55) - return true; - - /* endbr64 (64-bit only) */ - if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) - return true; - - return false; -} - -#else -static bool is_uprobe_at_func_entry(struct pt_regs *regs) -{ - return false; -} -#endif /* CONFIG_UPROBES */ - #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fe65be0b9d9c..853fe073bab3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2563,6 +2563,44 @@ static void intel_pmu_disable_fixed(struct perf_event *event) cpuc->fixed_ctrl_val &= ~mask; } +static inline void __intel_pmu_update_event_ext(int idx, u64 ext) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u32 msr; + + if (idx < INTEL_PMC_IDX_FIXED) { + msr = MSR_IA32_PMC_V6_GP0_CFG_C + + x86_pmu.addr_offset(idx, false); + } else { + msr = MSR_IA32_PMC_V6_FX0_CFG_C + + x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false); + } + + cpuc->cfg_c_val[idx] = ext; + wrmsrq(msr, ext); +} + +static void intel_pmu_disable_event_ext(struct perf_event *event) +{ + /* + * Only clear CFG_C MSR for PEBS counter group events, + * it avoids the HW counter's value to be added into + * other PEBS records incorrectly after PEBS counter + * group events are disabled. + * + * For other events, it's unnecessary to clear CFG_C MSRs + * since CFG_C doesn't take effect if counter is in + * disabled state. That helps to reduce the WRMSR overhead + * in context switches. + */ + if (!is_pebs_counter_event_group(event)) + return; + + __intel_pmu_update_event_ext(event->hw.idx, 0); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_disable_event_ext, intel_pmu_disable_event_ext); + static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -2571,9 +2609,12 @@ static void intel_pmu_disable_event(struct perf_event *event) switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1: intel_clear_masks(event, idx); + static_call_cond(intel_pmu_disable_event_ext)(event); x86_pmu_disable_event(event); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + static_call_cond(intel_pmu_disable_event_ext)(event); + fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_disable_fixed(event); break; @@ -2940,6 +2981,79 @@ static void intel_pmu_enable_acr(struct perf_event *event) DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); +static void intel_pmu_enable_event_ext(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + union arch_pebs_index old, new; + struct arch_pebs_cap cap; + u64 ext = 0; + + cap = hybrid(cpuc->pmu, arch_pebs_cap); + + if (event->attr.precise_ip) { + u64 pebs_data_cfg = intel_get_arch_pebs_data_config(event); + + ext |= ARCH_PEBS_EN; + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) + ext |= (-hwc->sample_period) & ARCH_PEBS_RELOAD; + + if (pebs_data_cfg && cap.caps) { + if (pebs_data_cfg & PEBS_DATACFG_MEMINFO) + ext |= ARCH_PEBS_AUX & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_GP) + ext |= ARCH_PEBS_GPR & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_XMMS) + ext |= ARCH_PEBS_VECR_XMM & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_LBRS) + ext |= ARCH_PEBS_LBR & cap.caps; + + if (pebs_data_cfg & + (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT)) + ext |= ARCH_PEBS_CNTR_GP & cap.caps; + + if (pebs_data_cfg & + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT)) + ext |= ARCH_PEBS_CNTR_FIXED & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_METRICS) + ext |= ARCH_PEBS_CNTR_METRICS & cap.caps; + } + + if (cpuc->n_pebs == cpuc->n_large_pebs) + new.thresh = ARCH_PEBS_THRESH_MULTI; + else + new.thresh = ARCH_PEBS_THRESH_SINGLE; + + rdmsrq(MSR_IA32_PEBS_INDEX, old.whole); + if (new.thresh != old.thresh || !old.en) { + if (old.thresh == ARCH_PEBS_THRESH_MULTI && old.wr > 0) { + /* + * Large PEBS was enabled. + * Drain PEBS buffer before applying the single PEBS. + */ + intel_pmu_drain_pebs_buffer(); + } else { + new.wr = 0; + new.full = 0; + new.en = 1; + wrmsrq(MSR_IA32_PEBS_INDEX, new.whole); + } + } + } + + if (is_pebs_counter_event_group(event)) + ext |= ARCH_PEBS_CNTR_ALLOW; + + if (cpuc->cfg_c_val[hwc->idx] != ext) + __intel_pmu_update_event_ext(hwc->idx, ext); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_enable_event_ext, intel_pmu_enable_event_ext); + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -2955,10 +3069,12 @@ static void intel_pmu_enable_event(struct perf_event *event) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); static_call_cond(intel_pmu_enable_acr_event)(event); + static_call_cond(intel_pmu_enable_event_ext)(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: static_call_cond(intel_pmu_enable_acr_event)(event); + static_call_cond(intel_pmu_enable_event_ext)(event); fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); @@ -3216,6 +3332,19 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) } /* + * Arch PEBS sets bit 54 in the global status register + */ + if (__test_and_clear_bit(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT, + (unsigned long *)&status)) { + handled++; + static_call(x86_pmu_drain_pebs)(regs, &data); + + if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] && + is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS])) + status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT; + } + + /* * Intel PT */ if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { @@ -3269,7 +3398,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * The PEBS buffer has to be drained before handling the A-PMI */ if (is_pebs_counter_event_group(event)) - x86_pmu.drain_pebs(regs, &data); + static_call(x86_pmu_drain_pebs)(regs, &data); last_period = event->hw.last_period; @@ -4029,7 +4158,9 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) if (!event->attr.exclude_kernel) flags &= ~PERF_SAMPLE_REGS_USER; if (event->attr.sample_regs_user & ~PEBS_GP_REGS) - flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); + flags &= ~PERF_SAMPLE_REGS_USER; + if (event->attr.sample_regs_intr & ~PEBS_GP_REGS) + flags &= ~PERF_SAMPLE_REGS_INTR; return flags; } @@ -4204,6 +4335,20 @@ static bool intel_pmu_is_acr_group(struct perf_event *event) return false; } +static inline bool intel_pmu_has_pebs_counter_group(struct pmu *pmu) +{ + u64 caps; + + if (x86_pmu.intel_cap.pebs_format >= 6 && x86_pmu.intel_cap.pebs_baseline) + return true; + + caps = hybrid(pmu, arch_pebs_cap).caps; + if (x86_pmu.arch_pebs && (caps & ARCH_PEBS_CNTR_MASK)) + return true; + + return false; +} + static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, u64 *cause_mask, int *num) { @@ -4237,6 +4382,8 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (event->attr.precise_ip) { + struct arch_pebs_cap pebs_cap = hybrid(event->pmu, arch_pebs_cap); + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) return -EINVAL; @@ -4250,6 +4397,15 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); + + if (x86_pmu.arch_pebs) { + u64 cntr_mask = hybrid(event->pmu, intel_ctrl) & + ~GLOBAL_CTRL_EN_PERF_METRICS; + u64 pebs_mask = event->attr.precise_ip >= 3 ? + pebs_cap.pdists : pebs_cap.counters; + if (cntr_mask != pebs_mask) + event->hw.dyn_constraint &= pebs_mask; + } } if (needs_branch_stack(event)) { @@ -4341,8 +4497,7 @@ static int intel_pmu_hw_config(struct perf_event *event) } if ((event->attr.sample_type & PERF_SAMPLE_READ) && - (x86_pmu.intel_cap.pebs_format >= 6) && - x86_pmu.intel_cap.pebs_baseline && + intel_pmu_has_pebs_counter_group(event->pmu) && is_sampling_event(event) && event->attr.precise_ip) event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; @@ -5212,7 +5367,13 @@ err: static int intel_pmu_cpu_prepare(int cpu) { - return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); + int ret; + + ret = intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); + if (ret) + return ret; + + return alloc_arch_pebs_buf_on_cpu(cpu); } static void flip_smm_bit(void *data) @@ -5257,6 +5418,163 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con u64 fixed_cntr_mask, u64 intel_ctrl); +enum dyn_constr_type { + DYN_CONSTR_NONE, + DYN_CONSTR_BR_CNTR, + DYN_CONSTR_ACR_CNTR, + DYN_CONSTR_ACR_CAUSE, + DYN_CONSTR_PEBS, + DYN_CONSTR_PDIST, + + DYN_CONSTR_MAX, +}; + +static const char * const dyn_constr_type_name[] = { + [DYN_CONSTR_NONE] = "a normal event", + [DYN_CONSTR_BR_CNTR] = "a branch counter logging event", + [DYN_CONSTR_ACR_CNTR] = "an auto-counter reload event", + [DYN_CONSTR_ACR_CAUSE] = "an auto-counter reload cause event", + [DYN_CONSTR_PEBS] = "a PEBS event", + [DYN_CONSTR_PDIST] = "a PEBS PDIST event", +}; + +static void __intel_pmu_check_dyn_constr(struct event_constraint *constr, + enum dyn_constr_type type, u64 mask) +{ + struct event_constraint *c1, *c2; + int new_weight, check_weight; + u64 new_mask, check_mask; + + for_each_event_constraint(c1, constr) { + new_mask = c1->idxmsk64 & mask; + new_weight = hweight64(new_mask); + + /* ignore topdown perf metrics event */ + if (c1->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) + continue; + + if (!new_weight && fls64(c1->idxmsk64) < INTEL_PMC_IDX_FIXED) { + pr_info("The event 0x%llx is not supported as %s.\n", + c1->code, dyn_constr_type_name[type]); + } + + if (new_weight <= 1) + continue; + + for_each_event_constraint(c2, c1 + 1) { + bool check_fail = false; + + check_mask = c2->idxmsk64 & mask; + check_weight = hweight64(check_mask); + + if (c2->idxmsk64 & INTEL_PMC_MSK_TOPDOWN || + !check_weight) + continue; + + /* The same constraints or no overlap */ + if (new_mask == check_mask || + (new_mask ^ check_mask) == (new_mask | check_mask)) + continue; + + /* + * A scheduler issue may be triggered in the following cases. + * - Two overlap constraints have the same weight. + * E.g., A constraints: 0x3, B constraints: 0x6 + * event counter failure case + * B PMC[2:1] 1 + * A PMC[1:0] 0 + * A PMC[1:0] FAIL + * - Two overlap constraints have different weight. + * The constraint has a low weight, but has high last bit. + * E.g., A constraints: 0x7, B constraints: 0xC + * event counter failure case + * B PMC[3:2] 2 + * A PMC[2:0] 0 + * A PMC[2:0] 1 + * A PMC[2:0] FAIL + */ + if (new_weight == check_weight) { + check_fail = true; + } else if (new_weight < check_weight) { + if ((new_mask | check_mask) != check_mask && + fls64(new_mask) > fls64(check_mask)) + check_fail = true; + } else { + if ((new_mask | check_mask) != new_mask && + fls64(new_mask) < fls64(check_mask)) + check_fail = true; + } + + if (check_fail) { + pr_info("The two events 0x%llx and 0x%llx may not be " + "fully scheduled under some circumstances as " + "%s.\n", + c1->code, c2->code, dyn_constr_type_name[type]); + } + } + } +} + +static void intel_pmu_check_dyn_constr(struct pmu *pmu, + struct event_constraint *constr, + u64 cntr_mask) +{ + enum dyn_constr_type i; + u64 mask; + + for (i = DYN_CONSTR_NONE; i < DYN_CONSTR_MAX; i++) { + mask = 0; + switch (i) { + case DYN_CONSTR_NONE: + mask = cntr_mask; + break; + case DYN_CONSTR_BR_CNTR: + if (x86_pmu.flags & PMU_FL_BR_CNTR) + mask = x86_pmu.lbr_counters; + break; + case DYN_CONSTR_ACR_CNTR: + mask = hybrid(pmu, acr_cntr_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); + break; + case DYN_CONSTR_ACR_CAUSE: + if (hybrid(pmu, acr_cntr_mask64) == hybrid(pmu, acr_cause_mask64)) + continue; + mask = hybrid(pmu, acr_cause_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); + break; + case DYN_CONSTR_PEBS: + if (x86_pmu.arch_pebs) + mask = hybrid(pmu, arch_pebs_cap).counters; + break; + case DYN_CONSTR_PDIST: + if (x86_pmu.arch_pebs) + mask = hybrid(pmu, arch_pebs_cap).pdists; + break; + default: + pr_warn("Unsupported dynamic constraint type %d\n", i); + } + + if (mask) + __intel_pmu_check_dyn_constr(constr, i, mask); + } +} + +static void intel_pmu_check_event_constraints_all(struct pmu *pmu) +{ + struct event_constraint *event_constraints = hybrid(pmu, event_constraints); + struct event_constraint *pebs_constraints = hybrid(pmu, pebs_constraints); + u64 cntr_mask = hybrid(pmu, cntr_mask64); + u64 fixed_cntr_mask = hybrid(pmu, fixed_cntr_mask64); + u64 intel_ctrl = hybrid(pmu, intel_ctrl); + + intel_pmu_check_event_constraints(event_constraints, cntr_mask, + fixed_cntr_mask, intel_ctrl); + + if (event_constraints) + intel_pmu_check_dyn_constr(pmu, event_constraints, cntr_mask); + + if (pebs_constraints) + intel_pmu_check_dyn_constr(pmu, pebs_constraints, cntr_mask); +} + static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs); static inline bool intel_pmu_broken_perf_cap(void) @@ -5269,34 +5587,89 @@ static inline bool intel_pmu_broken_perf_cap(void) return false; } +static inline void __intel_update_pmu_caps(struct pmu *pmu) +{ + struct pmu *dest_pmu = pmu ? pmu : x86_get_pmu(smp_processor_id()); + + if (hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_VECR_XMM) + dest_pmu->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; +} + +static inline void __intel_update_large_pebs_flags(struct pmu *pmu) +{ + u64 caps = hybrid(pmu, arch_pebs_cap).caps; + + x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; + if (caps & ARCH_PEBS_LBR) + x86_pmu.large_pebs_flags |= PERF_SAMPLE_BRANCH_STACK; + if (caps & ARCH_PEBS_CNTR_MASK) + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; + + if (!(caps & ARCH_PEBS_AUX)) + x86_pmu.large_pebs_flags &= ~PERF_SAMPLE_DATA_SRC; + if (!(caps & ARCH_PEBS_GPR)) { + x86_pmu.large_pebs_flags &= + ~(PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER); + } +} + +#define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED)) + static void update_pmu_cap(struct pmu *pmu) { - unsigned int cntr, fixed_cntr, ecx, edx; - union cpuid35_eax eax; - union cpuid35_ebx ebx; + unsigned int eax, ebx, ecx, edx; + union cpuid35_eax eax_0; + union cpuid35_ebx ebx_0; + u64 cntrs_mask = 0; + u64 pebs_mask = 0; + u64 pdists_mask = 0; - cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); + cpuid(ARCH_PERFMON_EXT_LEAF, &eax_0.full, &ebx_0.full, &ecx, &edx); - if (ebx.split.umask2) + if (ebx_0.split.umask2) hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; - if (ebx.split.eq) + if (ebx_0.split.eq) hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; - if (eax.split.cntr_subleaf) { + if (eax_0.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, - &cntr, &fixed_cntr, &ecx, &edx); - hybrid(pmu, cntr_mask64) = cntr; - hybrid(pmu, fixed_cntr_mask64) = fixed_cntr; + &eax, &ebx, &ecx, &edx); + hybrid(pmu, cntr_mask64) = eax; + hybrid(pmu, fixed_cntr_mask64) = ebx; + cntrs_mask = counter_mask(eax, ebx); } - if (eax.split.acr_subleaf) { + if (eax_0.split.acr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, - &cntr, &fixed_cntr, &ecx, &edx); + &eax, &ebx, &ecx, &edx); /* The mask of the counters which can be reloaded */ - hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); - + hybrid(pmu, acr_cntr_mask64) = counter_mask(eax, ebx); /* The mask of the counters which can cause a reload of reloadable counters */ - hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); + hybrid(pmu, acr_cause_mask64) = counter_mask(ecx, edx); + } + + /* Bits[5:4] should be set simultaneously if arch-PEBS is supported */ + if (eax_0.split.pebs_caps_subleaf && eax_0.split.pebs_cnts_subleaf) { + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_CAP_LEAF, + &eax, &ebx, &ecx, &edx); + hybrid(pmu, arch_pebs_cap).caps = (u64)ebx << 32; + + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_COUNTER_LEAF, + &eax, &ebx, &ecx, &edx); + pebs_mask = counter_mask(eax, ecx); + pdists_mask = counter_mask(ebx, edx); + hybrid(pmu, arch_pebs_cap).counters = pebs_mask; + hybrid(pmu, arch_pebs_cap).pdists = pdists_mask; + + if (WARN_ON((pebs_mask | pdists_mask) & ~cntrs_mask)) { + x86_pmu.arch_pebs = 0; + } else { + __intel_update_pmu_caps(pmu); + __intel_update_large_pebs_flags(pmu); + } + } else { + WARN_ON(x86_pmu.arch_pebs == 1); + x86_pmu.arch_pebs = 0; } if (!intel_pmu_broken_perf_cap()) { @@ -5319,10 +5692,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) else pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; - intel_pmu_check_event_constraints(pmu->event_constraints, - pmu->cntr_mask64, - pmu->fixed_cntr_mask64, - pmu->intel_ctrl); + intel_pmu_check_event_constraints_all(&pmu->pmu); intel_pmu_check_extra_regs(pmu->extra_regs); } @@ -5418,6 +5788,7 @@ static void intel_pmu_cpu_starting(int cpu) return; init_debug_store_on_cpu(cpu); + init_arch_pebs_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up, and that may * even boot with LBRs enabled. @@ -5456,6 +5827,8 @@ static void intel_pmu_cpu_starting(int cpu) } } + __intel_update_pmu_caps(cpuc->pmu); + if (!cpuc->shared_regs) return; @@ -5515,6 +5888,7 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dying(int cpu) { fini_debug_store_on_cpu(cpu); + fini_arch_pebs_on_cpu(cpu); } void intel_cpuc_finish(struct cpu_hw_events *cpuc) @@ -5535,6 +5909,7 @@ static void intel_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + release_arch_pebs_buf_on_cpu(cpu); intel_cpuc_finish(cpuc); if (is_hybrid() && cpuc->pmu) @@ -6250,7 +6625,7 @@ tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) static umode_t pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - return x86_pmu.ds_pebs ? attr->mode : 0; + return intel_pmu_has_pebs() ? attr->mode : 0; } static umode_t @@ -6940,8 +7315,11 @@ __init int intel_pmu_init(void) * Many features on and after V6 require dynamic constraint, * e.g., Arch PEBS, ACR. */ - if (version >= 6) + if (version >= 6) { x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; + x86_pmu.late_setup = intel_pmu_late_setup; + } + /* * Install the hw-cache-events table: */ @@ -7727,6 +8105,14 @@ __init int intel_pmu_init(void) if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) update_pmu_cap(NULL); + if (x86_pmu.arch_pebs) { + static_call_update(intel_pmu_disable_event_ext, + intel_pmu_disable_event_ext); + static_call_update(intel_pmu_enable_event_ext, + intel_pmu_enable_event_ext); + pr_cont("Architectural PEBS, "); + } + intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, &x86_pmu.fixed_cntr_mask64, &x86_pmu.intel_ctrl); @@ -7735,10 +8121,8 @@ __init int intel_pmu_init(void) if (x86_pmu.intel_cap.anythread_deprecated) x86_pmu.format_attrs = intel_arch_formats_attr; - intel_pmu_check_event_constraints(x86_pmu.event_constraints, - x86_pmu.cntr_mask64, - x86_pmu.fixed_cntr_mask64, - x86_pmu.intel_ctrl); + intel_pmu_check_event_constraints_all(NULL); + /* * Access LBR MSR may cause #GP under certain circumstances. * Check all LBR MSR here. diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index ec753e39b007..fa67fda6e45b 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL,SRF,GRR,ARL,LNL + * MTL,SRF,GRR,ARL,LNL,PTL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -53,31 +53,32 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * GRR,ARL,LNL + * GRR,ARL,LNL,PTL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, - * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL + * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL, + * PTL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL,SPR,MTL,ARL,LNL,SRF + * RPL,SPR,MTL,ARL,LNL,SRF,PTL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL, - * ADL,RPL,MTL,ARL,LNL + * ADL,RPL,MTL,ARL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * ARL,LNL + * ARL,LNL,PTL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -96,7 +97,7 @@ * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT,RKL,ADL,RPL,MTL,ARL,LNL + * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL * Scope: Package (physical package) * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. * perf code: 0x00 @@ -522,7 +523,6 @@ static const struct cstate_model lnl_cstates __initconst = { BIT(PERF_CSTATE_CORE_C7_RES), .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | - BIT(PERF_CSTATE_PKG_C3_RES) | BIT(PERF_CSTATE_PKG_C6_RES) | BIT(PERF_CSTATE_PKG_C10_RES), }; @@ -628,6 +628,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &srf_cstates), X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates), X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates), @@ -652,6 +653,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ARROWLAKE_H, &adl_cstates), X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 01bc59e9286c..feb1c3cf63e4 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -626,13 +626,18 @@ static int alloc_pebs_buffer(int cpu) int max, node = cpu_to_node(cpu); void *buffer, *insn_buff, *cea; - if (!x86_pmu.ds_pebs) + if (!intel_pmu_has_pebs()) return 0; buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); if (unlikely(!buffer)) return -ENOMEM; + if (x86_pmu.arch_pebs) { + hwev->pebs_vaddr = buffer; + return 0; + } + /* * HSW+ already provides us the eventing ip; no need to allocate this * buffer then. @@ -645,7 +650,7 @@ static int alloc_pebs_buffer(int cpu) } per_cpu(insn_buffer, cpu) = insn_buff; } - hwev->ds_pebs_vaddr = buffer; + hwev->pebs_vaddr = buffer; /* Update the cpu entry area mapping */ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; ds->pebs_buffer_base = (unsigned long) cea; @@ -661,17 +666,20 @@ static void release_pebs_buffer(int cpu) struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); void *cea; - if (!x86_pmu.ds_pebs) + if (!intel_pmu_has_pebs()) return; - kfree(per_cpu(insn_buffer, cpu)); - per_cpu(insn_buffer, cpu) = NULL; + if (x86_pmu.ds_pebs) { + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; - /* Clear the fixmap */ - cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; - ds_clear_cea(cea, x86_pmu.pebs_buffer_size); - dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); - hwev->ds_pebs_vaddr = NULL; + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds_clear_cea(cea, x86_pmu.pebs_buffer_size); + } + + dsfree_pages(hwev->pebs_vaddr, x86_pmu.pebs_buffer_size); + hwev->pebs_vaddr = NULL; } static int alloc_bts_buffer(int cpu) @@ -824,6 +832,56 @@ void reserve_ds_buffers(void) } } +inline int alloc_arch_pebs_buf_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return 0; + + return alloc_pebs_buffer(cpu); +} + +inline void release_arch_pebs_buf_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return; + + release_pebs_buffer(cpu); +} + +void init_arch_pebs_on_cpu(int cpu) +{ + struct cpu_hw_events *cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + u64 arch_pebs_base; + + if (!x86_pmu.arch_pebs) + return; + + if (!cpuc->pebs_vaddr) { + WARN(1, "Fail to allocate PEBS buffer on CPU %d\n", cpu); + x86_pmu.pebs_active = 0; + return; + } + + /* + * 4KB-aligned pointer of the output buffer + * (__alloc_pages_node() return page aligned address) + * Buffer Size = 4KB * 2^SIZE + * contiguous physical buffer (__alloc_pages_node() with order) + */ + arch_pebs_base = virt_to_phys(cpuc->pebs_vaddr) | PEBS_BUFFER_SHIFT; + wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, (u32)arch_pebs_base, + (u32)(arch_pebs_base >> 32)); + x86_pmu.pebs_active = 1; +} + +inline void fini_arch_pebs_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, 0, 0); +} + /* * BTS */ @@ -1471,6 +1529,25 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, } } +u64 intel_get_arch_pebs_data_config(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 pebs_data_cfg = 0; + u64 cntr_mask; + + if (WARN_ON(event->hw.idx < 0 || event->hw.idx >= X86_PMC_IDX_MAX)) + return 0; + + pebs_data_cfg |= pebs_update_adaptive_cfg(event); + + cntr_mask = (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT) | + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT) | + PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS; + pebs_data_cfg |= cpuc->pebs_data_cfg & cntr_mask; + + return pebs_data_cfg; +} + void intel_pmu_pebs_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1532,6 +1609,15 @@ static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc) intel_pmu_drain_pebs_buffer(); } +static void __intel_pmu_pebs_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + cpuc->pebs_enabled |= 1ULL << hwc->idx; +} + void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1540,9 +1626,7 @@ void intel_pmu_pebs_enable(struct perf_event *event) struct debug_store *ds = cpuc->ds; unsigned int idx = hwc->idx; - hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; - - cpuc->pebs_enabled |= 1ULL << hwc->idx; + __intel_pmu_pebs_enable(event); if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); @@ -1604,14 +1688,22 @@ void intel_pmu_pebs_del(struct perf_event *event) pebs_update_state(needed_cb, cpuc, event, false); } -void intel_pmu_pebs_disable(struct perf_event *event) +static void __intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; intel_pmu_drain_large_pebs(cpuc); - cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; +} + +void intel_pmu_pebs_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + __intel_pmu_pebs_disable(event); if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) @@ -1623,8 +1715,6 @@ void intel_pmu_pebs_disable(struct perf_event *event) if (cpuc->enabled) wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); - - hwc->config |= ARCH_PERFMON_EVENTSEL_INT; } void intel_pmu_pebs_enable_all(void) @@ -2060,6 +2150,90 @@ static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc, #define PEBS_LATENCY_MASK 0xffff +static inline void __setup_perf_sample_data(struct perf_event *event, + struct pt_regs *iregs, + struct perf_sample_data *data) +{ + perf_sample_data_init(data, 0, event->hw.last_period); + + /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happened between the record and + * PMI. + */ + perf_sample_save_callchain(data, event, iregs); +} + +static inline void __setup_pebs_basic_group(struct perf_event *event, + struct pt_regs *regs, + struct perf_sample_data *data, + u64 sample_type, u64 ip, + u64 tsc, u16 retire) +{ + /* The ip in basic is EventingIP */ + set_linear_ip(regs, ip); + regs->flags = PERF_EFLAGS_EXACT; + setup_pebs_time(event, data, tsc); + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) + data->weight.var3_w = retire; +} + +static inline void __setup_pebs_gpr_group(struct perf_event *event, + struct pt_regs *regs, + struct pebs_gprs *gprs, + u64 sample_type) +{ + if (event->attr.precise_ip < 2) { + set_linear_ip(regs, gprs->ip); + regs->flags &= ~PERF_EFLAGS_EXACT; + } + + if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) + adaptive_pebs_save_regs(regs, gprs); +} + +static inline void __setup_pebs_meminfo_group(struct perf_event *event, + struct perf_sample_data *data, + u64 sample_type, u64 latency, + u16 instr_latency, u64 address, + u64 aux, u64 tsx_tuning, u64 ax) +{ + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + u64 tsx_latency = intel_get_tsx_weight(tsx_tuning); + + data->weight.var2_w = instr_latency; + + /* + * Although meminfo::latency is defined as a u64, + * only the lower 32 bits include the valid data + * in practice on Ice Lake and earlier platforms. + */ + if (sample_type & PERF_SAMPLE_WEIGHT) + data->weight.full = latency ?: tsx_latency; + else + data->weight.var1_dw = (u32)latency ?: tsx_latency; + + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + + if (sample_type & PERF_SAMPLE_DATA_SRC) { + data->data_src.val = get_data_src(event, aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } + + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { + data->addr = address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } + + if (sample_type & PERF_SAMPLE_TRANSACTION) { + data->txn = intel_get_tsx_transaction(tsx_tuning, ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } +} + /* * With adaptive PEBS the layout depends on what fields are configured. */ @@ -2069,12 +2243,14 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 sample_type = event->attr.sample_type; struct pebs_basic *basic = __pebs; void *next_record = basic + 1; - u64 sample_type, format_group; struct pebs_meminfo *meminfo = NULL; struct pebs_gprs *gprs = NULL; struct x86_perf_regs *perf_regs; + u64 format_group; + u16 retire; if (basic == NULL) return; @@ -2082,31 +2258,17 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_regs = container_of(regs, struct x86_perf_regs, regs); perf_regs->xmm_regs = NULL; - sample_type = event->attr.sample_type; format_group = basic->format_group; - perf_sample_data_init(data, 0, event->hw.last_period); - setup_pebs_time(event, data, basic->tsc); - - /* - * We must however always use iregs for the unwinder to stay sane; the - * record BP,SP,IP can point into thin air when the record is from a - * previous PMI context or an (I)RET happened between the record and - * PMI. - */ - perf_sample_save_callchain(data, event, iregs); + __setup_perf_sample_data(event, iregs, data); *regs = *iregs; - /* The ip in basic is EventingIP */ - set_linear_ip(regs, basic->ip); - regs->flags = PERF_EFLAGS_EXACT; - if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { - if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) - data->weight.var3_w = basic->retire_latency; - else - data->weight.var3_w = 0; - } + /* basic group */ + retire = x86_pmu.flags & PMU_FL_RETIRE_LATENCY ? + basic->retire_latency : 0; + __setup_pebs_basic_group(event, regs, data, sample_type, + basic->ip, basic->tsc, retire); /* * The record for MEMINFO is in front of GP @@ -2122,54 +2284,20 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, gprs = next_record; next_record = gprs + 1; - if (event->attr.precise_ip < 2) { - set_linear_ip(regs, gprs->ip); - regs->flags &= ~PERF_EFLAGS_EXACT; - } - - if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) - adaptive_pebs_save_regs(regs, gprs); + __setup_pebs_gpr_group(event, regs, gprs, sample_type); } if (format_group & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { - u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? - meminfo->cache_latency : meminfo->mem_latency; - - if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) - data->weight.var2_w = meminfo->instr_latency; - - /* - * Although meminfo::latency is defined as a u64, - * only the lower 32 bits include the valid data - * in practice on Ice Lake and earlier platforms. - */ - if (sample_type & PERF_SAMPLE_WEIGHT) { - data->weight.full = latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); - } else { - data->weight.var1_dw = (u32)latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); - } - - data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; - } - - if (sample_type & PERF_SAMPLE_DATA_SRC) { - data->data_src.val = get_data_src(event, meminfo->aux); - data->sample_flags |= PERF_SAMPLE_DATA_SRC; - } - - if (sample_type & PERF_SAMPLE_ADDR_TYPE) { - data->addr = meminfo->address; - data->sample_flags |= PERF_SAMPLE_ADDR; - } - - if (sample_type & PERF_SAMPLE_TRANSACTION) { - data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, - gprs ? gprs->ax : 0); - data->sample_flags |= PERF_SAMPLE_TRANSACTION; - } + u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->cache_latency : meminfo->mem_latency; + u64 instr_latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->instr_latency : 0; + u64 ax = gprs ? gprs->ax : 0; + + __setup_pebs_meminfo_group(event, data, sample_type, latency, + instr_latency, meminfo->address, + meminfo->aux, meminfo->tsx_tuning, + ax); } if (format_group & PEBS_DATACFG_XMMS) { @@ -2220,6 +2348,135 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, format_group); } +static inline bool arch_pebs_record_continued(struct arch_pebs_header *header) +{ + /* Continue bit or null PEBS record indicates fragment follows. */ + return header->cont || !(header->format & GENMASK_ULL(63, 16)); +} + +static void setup_arch_pebs_sample_data(struct perf_event *event, + struct pt_regs *iregs, + void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 sample_type = event->attr.sample_type; + struct arch_pebs_header *header = NULL; + struct arch_pebs_aux *meminfo = NULL; + struct arch_pebs_gprs *gprs = NULL; + struct x86_perf_regs *perf_regs; + void *next_record; + void *at = __pebs; + + if (at == NULL) + return; + + perf_regs = container_of(regs, struct x86_perf_regs, regs); + perf_regs->xmm_regs = NULL; + + __setup_perf_sample_data(event, iregs, data); + + *regs = *iregs; + +again: + header = at; + next_record = at + sizeof(struct arch_pebs_header); + if (header->basic) { + struct arch_pebs_basic *basic = next_record; + u16 retire = 0; + + next_record = basic + 1; + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) + retire = basic->valid ? basic->retire : 0; + __setup_pebs_basic_group(event, regs, data, sample_type, + basic->ip, basic->tsc, retire); + } + + /* + * The record for MEMINFO is in front of GP + * But PERF_SAMPLE_TRANSACTION needs gprs->ax. + * Save the pointer here but process later. + */ + if (header->aux) { + meminfo = next_record; + next_record = meminfo + 1; + } + + if (header->gpr) { + gprs = next_record; + next_record = gprs + 1; + + __setup_pebs_gpr_group(event, regs, + (struct pebs_gprs *)gprs, + sample_type); + } + + if (header->aux) { + u64 ax = gprs ? gprs->ax : 0; + + __setup_pebs_meminfo_group(event, data, sample_type, + meminfo->cache_latency, + meminfo->instr_latency, + meminfo->address, meminfo->aux, + meminfo->tsx_tuning, ax); + } + + if (header->xmm) { + struct pebs_xmm *xmm; + + next_record += sizeof(struct arch_pebs_xer_header); + + xmm = next_record; + perf_regs->xmm_regs = xmm->xmm; + next_record = xmm + 1; + } + + if (header->lbr) { + struct arch_pebs_lbr_header *lbr_header = next_record; + struct lbr_entry *lbr; + int num_lbr; + + next_record = lbr_header + 1; + lbr = next_record; + + num_lbr = header->lbr == ARCH_PEBS_LBR_NUM_VAR ? + lbr_header->depth : + header->lbr * ARCH_PEBS_BASE_LBR_ENTRIES; + next_record += num_lbr * sizeof(struct lbr_entry); + + if (has_branch_stack(event)) { + intel_pmu_store_pebs_lbrs(lbr); + intel_pmu_lbr_save_brstack(data, cpuc, event); + } + } + + if (header->cntr) { + struct arch_pebs_cntr_header *cntr = next_record; + unsigned int nr; + + next_record += sizeof(struct arch_pebs_cntr_header); + + if (is_pebs_counter_event_group(event)) { + __setup_pebs_counter_group(cpuc, event, + (struct pebs_cntr_header *)cntr, next_record); + data->sample_flags |= PERF_SAMPLE_READ; + } + + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed); + if (cntr->metrics == INTEL_CNTR_METRICS) + nr += 2; + next_record += nr * sizeof(u64); + } + + /* Parse followed fragments if there are. */ + if (arch_pebs_record_continued(header)) { + at = at + header->size; + goto again; + } +} + static inline void * get_next_pebs_record_by_bit(void *base, void *top, int bit) { @@ -2602,6 +2859,57 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } } +static __always_inline void +__intel_pmu_handle_pebs_record(struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + void *at, u64 pebs_status, + short *counts, void **last, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + int bit; + + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { + event = cpuc->events[bit]; + + if (WARN_ON_ONCE(!event) || + WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + if (counts[bit]++) { + __intel_pmu_pebs_event(event, iregs, regs, data, + last[bit], setup_sample); + } + + last[bit] = at; + } +} + +static __always_inline void +__intel_pmu_handle_last_pebs_record(struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + u64 mask, short *counts, void **last, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + int bit; + + for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { + if (!counts[bit]) + continue; + + event = cpuc->events[bit]; + + __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], + counts[bit], setup_sample); + } + +} + static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; @@ -2611,9 +2919,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d struct x86_perf_regs perf_regs; struct pt_regs *regs = &perf_regs.regs; struct pebs_basic *basic; - struct perf_event *event; void *base, *at, *top; - int bit; u64 mask; if (!x86_pmu.pebs_active) @@ -2626,6 +2932,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d mask = hybrid(cpuc->pmu, pebs_events_mask) | (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); + mask &= cpuc->pebs_enabled; if (unlikely(base >= top)) { intel_pmu_pebs_event_update_no_drain(cpuc, mask); @@ -2643,38 +2950,114 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d if (basic->format_size != cpuc->pebs_record_size) continue; - pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask; - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { - event = cpuc->events[bit]; + pebs_status = mask & basic->applicable_counters; + __intel_pmu_handle_pebs_record(iregs, regs, data, at, + pebs_status, counts, last, + setup_pebs_adaptive_sample_data); + } - if (WARN_ON_ONCE(!event) || - WARN_ON_ONCE(!event->attr.precise_ip)) - continue; + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, counts, last, + setup_pebs_adaptive_sample_data); +} - if (counts[bit]++) { - __intel_pmu_pebs_event(event, iregs, regs, data, last[bit], - setup_pebs_adaptive_sample_data); - } - last[bit] = at; - } +static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs, + struct perf_sample_data *data) +{ + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS]; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union arch_pebs_index index; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + void *base, *at, *top; + u64 mask; + + rdmsrq(MSR_IA32_PEBS_INDEX, index.whole); + + if (unlikely(!index.wr)) { + intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); + return; } - for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { - if (!counts[bit]) + base = cpuc->pebs_vaddr; + top = cpuc->pebs_vaddr + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT); + + index.wr = 0; + index.full = 0; + index.en = 1; + if (cpuc->n_pebs == cpuc->n_large_pebs) + index.thresh = ARCH_PEBS_THRESH_MULTI; + else + index.thresh = ARCH_PEBS_THRESH_SINGLE; + wrmsrq(MSR_IA32_PEBS_INDEX, index.whole); + + mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled; + + if (!iregs) + iregs = &dummy_iregs; + + /* Process all but the last event for each counter. */ + for (at = base; at < top;) { + struct arch_pebs_header *header; + struct arch_pebs_basic *basic; + u64 pebs_status; + + header = at; + + if (WARN_ON_ONCE(!header->size)) + break; + + /* 1st fragment or single record must have basic group */ + if (!header->basic) { + at += header->size; continue; + } - event = cpuc->events[bit]; + basic = at + sizeof(struct arch_pebs_header); + pebs_status = mask & basic->applicable_counters; + __intel_pmu_handle_pebs_record(iregs, regs, data, at, + pebs_status, counts, last, + setup_arch_pebs_sample_data); + + /* Skip non-last fragments */ + while (arch_pebs_record_continued(header)) { + if (!header->size) + break; + at += header->size; + header = at; + } - __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], - counts[bit], setup_pebs_adaptive_sample_data); + /* Skip last fragment or the single record */ + at += header->size; } + + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, + counts, last, + setup_arch_pebs_sample_data); +} + +static void __init intel_arch_pebs_init(void) +{ + /* + * Current hybrid platforms always both support arch-PEBS or not + * on all kinds of cores. So directly set x86_pmu.arch_pebs flag + * if boot cpu supports arch-PEBS. + */ + x86_pmu.arch_pebs = 1; + x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; + x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs; + x86_pmu.pebs_capable = ~0ULL; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + + x86_pmu.pebs_enable = __intel_pmu_pebs_enable; + x86_pmu.pebs_disable = __intel_pmu_pebs_disable; } /* * PEBS probe and setup */ -void __init intel_pebs_init(void) +static void __init intel_ds_pebs_init(void) { /* * No support for 32bit formats @@ -2736,10 +3119,8 @@ void __init intel_pebs_init(void) break; case 6: - if (x86_pmu.intel_cap.pebs_baseline) { + if (x86_pmu.intel_cap.pebs_baseline) x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; - x86_pmu.late_setup = intel_pmu_late_setup; - } fallthrough; case 5: x86_pmu.pebs_ept = 1; @@ -2789,6 +3170,14 @@ void __init intel_pebs_init(void) } } +void __init intel_pebs_init(void) +{ + if (x86_pmu.intel_cap.pebs_format == 0xf) + intel_arch_pebs_init(); + else + intel_ds_pebs_init(); +} + void perf_restore_debug_store(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d6c945cc5d07..e228e564b15e 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1325,8 +1325,6 @@ static void uncore_pci_sub_driver_init(void) continue; pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; - if (!pmu) - continue; if (uncore_pci_get_dev_die_info(pci_sub_dev, &die)) continue; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2b969386dcdd..3161ec0a3416 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -283,8 +283,9 @@ struct cpu_hw_events { * Intel DebugStore bits */ struct debug_store *ds; - void *ds_pebs_vaddr; void *ds_bts_vaddr; + /* DS based PEBS or arch-PEBS buffer address */ + void *pebs_vaddr; u64 pebs_enabled; int n_pebs; int n_large_pebs; @@ -303,6 +304,8 @@ struct cpu_hw_events { /* Intel ACR configuration */ u64 acr_cfg_b[X86_PMC_IDX_MAX]; u64 acr_cfg_c[X86_PMC_IDX_MAX]; + /* Cached CFG_C values */ + u64 cfg_c_val[X86_PMC_IDX_MAX]; /* * Intel LBR bits @@ -708,6 +711,12 @@ enum hybrid_pmu_type { hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny, }; +struct arch_pebs_cap { + u64 caps; + u64 counters; + u64 pdists; +}; + struct x86_hybrid_pmu { struct pmu pmu; const char *name; @@ -752,6 +761,8 @@ struct x86_hybrid_pmu { mid_ack :1, enabled_ack :1; + struct arch_pebs_cap arch_pebs_cap; + u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX]; }; @@ -906,7 +917,7 @@ struct x86_pmu { union perf_capabilities intel_cap; /* - * Intel DebugStore bits + * Intel DebugStore and PEBS bits */ unsigned int bts :1, bts_active :1, @@ -917,7 +928,8 @@ struct x86_pmu { pebs_no_tlb :1, pebs_no_isolation :1, pebs_block :1, - pebs_ept :1; + pebs_ept :1, + arch_pebs :1; int pebs_record_size; int pebs_buffer_size; u64 pebs_events_mask; @@ -930,6 +942,11 @@ struct x86_pmu { u64 pebs_capable; /* + * Intel Architectural PEBS + */ + struct arch_pebs_cap arch_pebs_cap; + + /* * Intel LBR */ unsigned int lbr_tos, lbr_from, lbr_to, @@ -1124,7 +1141,6 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ .pmu_type = _pmu, \ } -int is_x86_event(struct perf_event *event); struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; @@ -1217,7 +1233,7 @@ int x86_reserve_hardware(void); void x86_release_hardware(void); -int x86_pmu_max_precise(void); +int x86_pmu_max_precise(struct pmu *pmu); void hw_perf_lbr_event_destroy(struct perf_event *event); @@ -1604,6 +1620,14 @@ extern void intel_cpuc_finish(struct cpu_hw_events *cpuc); int intel_pmu_init(void); +int alloc_arch_pebs_buf_on_cpu(int cpu); + +void release_arch_pebs_buf_on_cpu(int cpu); + +void init_arch_pebs_on_cpu(int cpu); + +void fini_arch_pebs_on_cpu(int cpu); + void init_debug_store_on_cpu(int cpu); void fini_debug_store_on_cpu(int cpu); @@ -1760,6 +1784,8 @@ void intel_pmu_pebs_data_source_cmt(void); void intel_pmu_pebs_data_source_lnl(void); +u64 intel_get_arch_pebs_data_config(struct perf_event *event); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); @@ -1792,6 +1818,11 @@ static inline int intel_pmu_max_num_pebs(struct pmu *pmu) return fls((u32)hybrid(pmu, pebs_events_mask)); } +static inline bool intel_pmu_has_pebs(void) +{ + return x86_pmu.ds_pebs || x86_pmu.arch_pebs; +} + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 15bc07a5ebb3..b14c045679e1 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -198,6 +198,7 @@ static inline int alternatives_text_reserved(void *start, void *end) #define ALTINSTR_ENTRY(ft_flags) \ ".pushsection .altinstructions,\"a\"\n" \ + ANNOTATE_DATA_SPECIAL \ " .long 771b - .\n" /* label */ \ " .long 774f - .\n" /* new instruction */ \ " .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \ @@ -207,6 +208,7 @@ static inline int alternatives_text_reserved(void *start, void *end) #define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \ ".pushsection .altinstr_replacement, \"ax\"\n" \ + ANNOTATE_DATA_SPECIAL \ "# ALT: replacement\n" \ "774:\n\t" newinstr "\n775:\n" \ ".popsection\n" @@ -337,6 +339,7 @@ void nop_func(void); * instruction. See apply_alternatives(). */ .macro altinstr_entry orig alt ft_flags orig_len alt_len + ANNOTATE_DATA_SPECIAL .long \orig - . .long \alt - . .4byte \ft_flags @@ -365,6 +368,7 @@ void nop_func(void); .popsection ; \ .pushsection .altinstr_replacement,"ax" ; \ 743: \ + ANNOTATE_DATA_SPECIAL ; \ newinst ; \ 744: \ .popsection ; diff --git a/arch/x86/include/asm/amd/node.h b/arch/x86/include/asm/amd/node.h index 23fe617898a8..a672b8765fa8 100644 --- a/arch/x86/include/asm/amd/node.h +++ b/arch/x86/include/asm/amd/node.h @@ -23,7 +23,6 @@ #define AMD_NODE0_PCI_SLOT 0x18 struct pci_dev *amd_node_get_func(u16 node, u8 func); -struct pci_dev *amd_node_get_root(u16 node); static inline u16 amd_num_nodes(void) { diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index d5c8d3afe196..bd62bd87a841 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_ASM_H #define _ASM_X86_ASM_H +#include <linux/annotate.h> + #ifdef __ASSEMBLER__ # define __ASM_FORM(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ @@ -132,6 +134,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ .balign 4 ; \ + ANNOTATE_DATA_SPECIAL ; \ .long (from) - . ; \ .long (to) - . ; \ .long type ; \ @@ -179,6 +182,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # define _ASM_EXTABLE_TYPE(from, to, type) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ + ANNOTATE_DATA_SPECIAL \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ " .long " __stringify(type) " \n" \ @@ -187,6 +191,7 @@ static __always_inline __pure void *rip_rel_ptr(void *p) # define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ + ANNOTATE_DATA_SPECIAL \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ DEFINE_EXTABLE_TYPE_REG \ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 880ca15073ed..ab5bba6cf7f5 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -7,6 +7,11 @@ #include <linux/objtool.h> #include <asm/asm.h> +#ifndef __ASSEMBLY__ +struct bug_entry; +extern void __WARN_trap(struct bug_entry *bug, ...); +#endif + /* * Despite that some emulators terminate on UD2, we use it for WARN(). */ @@ -31,52 +36,77 @@ #define BUG_UD2 0xfffe #define BUG_UD1 0xfffd #define BUG_UD1_UBSAN 0xfffc +#define BUG_UD1_WARN 0xfffb #define BUG_UDB 0xffd6 #define BUG_LOCK 0xfff0 #ifdef CONFIG_GENERIC_BUG -#ifdef CONFIG_X86_32 -# define __BUG_REL(val) ".long " val +#ifdef CONFIG_DEBUG_BUGVERBOSE +#define __BUG_ENTRY_VERBOSE(file, line) \ + "\t.long " file " - .\t# bug_entry::file\n" \ + "\t.word " line "\t# bug_entry::line\n" #else -# define __BUG_REL(val) ".long " val " - ." +#define __BUG_ENTRY_VERBOSE(file, line) #endif -#ifdef CONFIG_DEBUG_BUGVERBOSE -#define __BUG_ENTRY(file, line, flags) \ - "2:\t" __BUG_REL("1b") "\t# bug_entry::bug_addr\n" \ - "\t" __BUG_REL(file) "\t# bug_entry::file\n" \ - "\t.word " line "\t# bug_entry::line\n" \ - "\t.word " flags "\t# bug_entry::flags\n" +#if defined(CONFIG_X86_64) || defined(CONFIG_DEBUG_BUGVERBOSE_DETAILED) +#define HAVE_ARCH_BUG_FORMAT +#define __BUG_ENTRY_FORMAT(format) \ + "\t.long " format " - .\t# bug_entry::format\n" #else -#define __BUG_ENTRY(file, line, flags) \ - "2:\t" __BUG_REL("1b") "\t# bug_entry::bug_addr\n" \ - "\t.word " flags "\t# bug_entry::flags\n" +#define __BUG_ENTRY_FORMAT(format) +#endif + +#ifdef CONFIG_X86_64 +#define HAVE_ARCH_BUG_FORMAT_ARGS #endif -#define _BUG_FLAGS_ASM(ins, file, line, flags, size, extra) \ - "1:\t" ins "\n" \ - ".pushsection __bug_table,\"aw\"\n" \ - __BUG_ENTRY(file, line, flags) \ +#define __BUG_ENTRY(format, file, line, flags) \ + "\t.long 1b - ." "\t# bug_entry::bug_addr\n" \ + __BUG_ENTRY_FORMAT(format) \ + __BUG_ENTRY_VERBOSE(file, line) \ + "\t.word " flags "\t# bug_entry::flags\n" + +#define _BUG_FLAGS_ASM(format, file, line, flags, size, extra) \ + ".pushsection __bug_table,\"aw\"\n\t" \ + ANNOTATE_DATA_SPECIAL \ + "2:\n\t" \ + __BUG_ENTRY(format, file, line, flags) \ "\t.org 2b + " size "\n" \ ".popsection\n" \ extra -#define _BUG_FLAGS(ins, flags, extra) \ +#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED +#define WARN_CONDITION_STR(cond_str) cond_str +#else +#define WARN_CONDITION_STR(cond_str) "" +#endif + +#define _BUG_FLAGS(cond_str, ins, flags, extra) \ do { \ - asm_inline volatile(_BUG_FLAGS_ASM(ins, "%c0", \ - "%c1", "%c2", "%c3", extra) \ - : : "i" (__FILE__), "i" (__LINE__), \ - "i" (flags), \ - "i" (sizeof(struct bug_entry))); \ + asm_inline volatile("1:\t" ins "\n" \ + _BUG_FLAGS_ASM("%c[fmt]", "%c[file]", \ + "%c[line]", "%c[fl]", \ + "%c[size]", extra) \ + : : [fmt] "i" (WARN_CONDITION_STR(cond_str)), \ + [file] "i" (__FILE__), \ + [line] "i" (__LINE__), \ + [fl] "i" (flags), \ + [size] "i" (sizeof(struct bug_entry))); \ } while (0) #define ARCH_WARN_ASM(file, line, flags, size) \ - _BUG_FLAGS_ASM(ASM_UD2, file, line, flags, size, "") + ".pushsection .rodata.str1.1, \"aMS\", @progbits, 1\n" \ + "99:\n" \ + "\t.string \"\"\n" \ + ".popsection\n" \ + "1:\t " ASM_UD2 "\n" \ + _BUG_FLAGS_ASM("99b", file, line, flags, size, "") #else -#define _BUG_FLAGS(ins, flags, extra) asm volatile(ins) +#define _BUG_FLAGS(cond_str, ins, flags, extra) asm volatile(ins) #endif /* CONFIG_GENERIC_BUG */ @@ -84,7 +114,7 @@ do { \ #define BUG() \ do { \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, 0, ""); \ + _BUG_FLAGS("", ASM_UD2, 0, ""); \ __builtin_unreachable(); \ } while (0) @@ -97,14 +127,69 @@ do { \ #define ARCH_WARN_REACHABLE ANNOTATE_REACHABLE(1b) -#define __WARN_FLAGS(flags) \ -do { \ - __auto_type __flags = BUGFLAG_WARNING|(flags); \ - instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, __flags, ARCH_WARN_REACHABLE); \ - instrumentation_end(); \ +#define __WARN_FLAGS(cond_str, flags) \ +do { \ + __auto_type __flags = BUGFLAG_WARNING|(flags); \ + instrumentation_begin(); \ + _BUG_FLAGS(cond_str, ASM_UD2, __flags, ARCH_WARN_REACHABLE); \ + instrumentation_end(); \ } while (0) +#ifdef HAVE_ARCH_BUG_FORMAT_ARGS + +#ifndef __ASSEMBLY__ +#include <linux/static_call_types.h> +DECLARE_STATIC_CALL(WARN_trap, __WARN_trap); + +struct pt_regs; +struct sysv_va_list { /* from AMD64 System V ABI */ + unsigned int gp_offset; + unsigned int fp_offset; + void *overflow_arg_area; + void *reg_save_area; +}; +struct arch_va_list { + unsigned long regs[6]; + struct sysv_va_list args; +}; +extern void *__warn_args(struct arch_va_list *args, struct pt_regs *regs); +#endif /* __ASSEMBLY__ */ + +#define __WARN_bug_entry(flags, format) ({ \ + struct bug_entry *bug; \ + asm_inline volatile("lea (2f)(%%rip), %[addr]\n1:\n" \ + _BUG_FLAGS_ASM("%c[fmt]", "%c[file]", \ + "%c[line]", "%c[fl]", \ + "%c[size]", "") \ + : [addr] "=r" (bug) \ + : [fmt] "i" (format), \ + [file] "i" (__FILE__), \ + [line] "i" (__LINE__), \ + [fl] "i" (flags), \ + [size] "i" (sizeof(struct bug_entry))); \ + bug; }) + +#define __WARN_print_arg(flags, format, arg...) \ +do { \ + int __flags = (flags) | BUGFLAG_WARNING | BUGFLAG_ARGS ; \ + static_call_mod(WARN_trap)(__WARN_bug_entry(__flags, format), ## arg); \ + asm (""); /* inhibit tail-call optimization */ \ +} while (0) + +#define __WARN_printf(taint, fmt, arg...) \ + __WARN_print_arg(BUGFLAG_TAINT(taint), fmt, ## arg) + +#define WARN_ONCE(cond, format, arg...) ({ \ + int __ret_warn_on = !!(cond); \ + if (unlikely(__ret_warn_on)) { \ + __WARN_print_arg(BUGFLAG_ONCE|BUGFLAG_TAINT(TAINT_WARN),\ + format, ## arg); \ + } \ + __ret_warn_on; \ +}) + +#endif /* HAVE_ARCH_BUG_FORMAT_ARGS */ + #include <asm-generic/bug.h> #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 893cbca37fe9..fc5f32d4da6e 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -101,6 +101,7 @@ static __always_inline bool _static_cpu_has(u16 bit) asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" + ANNOTATE_DATA_SPECIAL " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4091a776e37a..9e54fc0e7ed3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -320,7 +320,7 @@ #define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */ -#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */ +#define X86_FEATURE_LKGS (12*32+18) /* Like MOV_GS except MSR_KERNEL_GS_BASE = GS.base */ #define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ @@ -499,6 +499,9 @@ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ +#define X86_FEATURE_SGX_EUPDATESVN (21*32+17) /* Support for ENCLS[EUPDATESVN] instruction */ + +#define X86_FEATURE_SDCIAE (21*32+18) /* L3 Smart Data Cache Injection Allocation Enforcement */ /* * BUG word(s) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 93156ac4ffe0..b08c95872eed 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -56,6 +56,11 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) return &arch_ftrace_regs(fregs)->regs; } +#define arch_ftrace_partial_regs(regs) do { \ + regs->flags &= ~X86_EFLAGS_FIXED; \ + regs->cs = __KERNEL_CS; \ +} while (0) + #define arch_ftrace_fill_perf_regs(fregs, _regs) do { \ (_regs)->ip = arch_ftrace_regs(fregs)->regs.ip; \ (_regs)->sp = arch_ftrace_regs(fregs)->regs.sp; \ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 6e2458088800..fe5d9a10d900 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -46,38 +46,31 @@ do { \ } while(0) static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, - u32 __user *uaddr) + u32 __user *uaddr) { - if (can_do_masked_user_access()) - uaddr = masked_user_access_begin(uaddr); - else if (!user_access_begin(uaddr, sizeof(u32))) - return -EFAULT; - - switch (op) { - case FUTEX_OP_SET: - unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault); - break; - case FUTEX_OP_ADD: - unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval, - uaddr, oparg, Efault); - break; - case FUTEX_OP_OR: - unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault); - break; - case FUTEX_OP_ANDN: - unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault); - break; - case FUTEX_OP_XOR: - unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault); - break; - default: - user_access_end(); - return -ENOSYS; + scoped_user_rw_access(uaddr, Efault) { + switch (op) { + case FUTEX_OP_SET: + unsafe_atomic_op1("xchgl %0, %2", oval, uaddr, oparg, Efault); + break; + case FUTEX_OP_ADD: + unsafe_atomic_op1(LOCK_PREFIX "xaddl %0, %2", oval, uaddr, oparg, Efault); + break; + case FUTEX_OP_OR: + unsafe_atomic_op2("orl %4, %3", oval, uaddr, oparg, Efault); + break; + case FUTEX_OP_ANDN: + unsafe_atomic_op2("andl %4, %3", oval, uaddr, ~oparg, Efault); + break; + case FUTEX_OP_XOR: + unsafe_atomic_op2("xorl %4, %3", oval, uaddr, oparg, Efault); + break; + default: + return -ENOSYS; + } } - user_access_end(); return 0; Efault: - user_access_end(); return -EFAULT; } @@ -86,21 +79,19 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, { int ret = 0; - if (can_do_masked_user_access()) - uaddr = masked_user_access_begin(uaddr); - else if (!user_access_begin(uaddr, sizeof(u32))) - return -EFAULT; - asm volatile("\n" - "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" - "2:\n" - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) \ - : "+r" (ret), "=a" (oldval), "+m" (*uaddr) - : "r" (newval), "1" (oldval) - : "memory" - ); - user_access_end(); - *uval = oldval; + scoped_user_rw_access(uaddr, Efault) { + asm_inline volatile("\n" + "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" + "2:\n" + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) + : "+r" (ret), "=a" (oldval), "+m" (*uaddr) + : "r" (newval), "1" (oldval) + : "memory"); + *uval = oldval; + } return ret; +Efault: + return -EFAULT; } #endif diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index abd637e54e94..3218770670d3 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -393,7 +393,7 @@ static __always_inline void __##func(struct pt_regs *regs) /** * DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler - when raised from kernel mode + * when raised from kernel mode * @func: Function name of the entry point * * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE @@ -403,7 +403,7 @@ static __always_inline void __##func(struct pt_regs *regs) /** * DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler - when raised from user mode + * when raised from user mode * @func: Function name of the entry point * * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 54368a43abf6..4733e9064ee5 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -44,4 +44,6 @@ enum insn_mmio_type { enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes); +bool insn_is_nop(struct insn *insn); + #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 091f88c8254d..846d21c1a7f8 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -312,7 +312,6 @@ static inline int insn_offset_immediate(struct insn *insn) /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. - * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix @@ -321,8 +320,8 @@ static inline int insn_offset_immediate(struct insn *insn) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. */ -#define for_each_insn_prefix(insn, idx, prefix) \ - for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) +#define for_each_insn_prefix(insn, prefix) \ + for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 5dbeac48a5b9..695f87efbeb8 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -4,7 +4,15 @@ #include <linux/percpu-defs.h> #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SHIFT 4 +#define PEBS_BUFFER_SIZE (PAGE_SIZE << PEBS_BUFFER_SHIFT) + +/* + * The largest PEBS record could consume a page, ensure + * a record at least can be written after triggering PMI. + */ +#define ARCH_PEBS_THRESH_MULTI ((PEBS_BUFFER_SIZE - PAGE_SIZE) >> PEBS_BUFFER_SHIFT) +#define ARCH_PEBS_THRESH_SINGLE 1 /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS_FMT4 8 diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 61dd1dee7812..e0a6930a4029 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -15,6 +15,7 @@ #define JUMP_TABLE_ENTRY(key, label) \ ".pushsection __jump_table, \"aw\" \n\t" \ _ASM_ALIGN "\n\t" \ + ANNOTATE_DATA_SPECIAL \ ".long 1b - . \n\t" \ ".long " label " - . \n\t" \ _ASM_PTR " " key " - . \n\t" \ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 31e3cb550fb3..2d98886de09a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -48,6 +48,7 @@ /* AMD-specific bits */ #define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */ +#define MCI_STATUS_PADDRV BIT_ULL(54) /* Valid System Physical Address */ #define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */ #define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */ @@ -62,6 +63,7 @@ */ #define MCI_CONFIG_MCAX 0x1 #define MCI_CONFIG_FRUTEXT BIT_ULL(9) +#define MCI_CONFIG_PADDRV BIT_ULL(11) #define MCI_IPID_MCATYPE 0xFFFF0000 #define MCI_IPID_HWID 0xFFF @@ -166,6 +168,12 @@ #define MCE_IN_KERNEL_COPYIN BIT_ULL(7) /* + * Indicates that handler should check and clear Deferred error registers + * rather than common ones. + */ +#define MCE_CHECK_DFR_REGS BIT_ULL(8) + +/* * This structure contains all data related to the MCE log. Also * carries a signature to make it easier to find from external * debugging tools. Each entry is only valid when its finished flag @@ -302,6 +310,12 @@ DECLARE_PER_CPU(struct mce, injectm); /* Disable CMCI/polling for MCA bank claimed by firmware */ extern void mce_disable_bank(int bank); +#ifdef CONFIG_X86_MCE_THRESHOLD +void mce_save_apei_thr_limit(u32 thr_limit); +#else +static inline void mce_save_apei_thr_limit(u32 thr_limit) { } +#endif /* CONFIG_X86_MCE_THRESHOLD */ + /* * Exception handler */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9e1720d73244..3d0a0950d20a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -166,6 +166,10 @@ * Processor MMIO stale data * vulnerabilities. */ +#define ARCH_CAP_MCU_ENUM BIT(16) /* + * Indicates the presence of microcode update + * feature enumeration and status information. + */ #define ARCH_CAP_FB_CLEAR BIT(17) /* * VERW clears CPU fill buffer * even on MDS_NO CPUs. @@ -327,6 +331,26 @@ PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ PERF_CAP_PEBS_TIMING_INFO) +/* Arch PEBS */ +#define MSR_IA32_PEBS_BASE 0x000003f4 +#define MSR_IA32_PEBS_INDEX 0x000003f5 +#define ARCH_PEBS_OFFSET_MASK 0x7fffff +#define ARCH_PEBS_INDEX_WR_SHIFT 4 + +#define ARCH_PEBS_RELOAD 0xffffffff +#define ARCH_PEBS_CNTR_ALLOW BIT_ULL(35) +#define ARCH_PEBS_CNTR_GP BIT_ULL(36) +#define ARCH_PEBS_CNTR_FIXED BIT_ULL(37) +#define ARCH_PEBS_CNTR_METRICS BIT_ULL(38) +#define ARCH_PEBS_LBR_SHIFT 40 +#define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT) +#define ARCH_PEBS_VECR_XMM BIT_ULL(49) +#define ARCH_PEBS_GPR BIT_ULL(61) +#define ARCH_PEBS_AUX BIT_ULL(62) +#define ARCH_PEBS_EN BIT_ULL(63) +#define ARCH_PEBS_CNTR_MASK (ARCH_PEBS_CNTR_GP | ARCH_PEBS_CNTR_FIXED | \ + ARCH_PEBS_CNTR_METRICS) + #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) #define RTIT_CTL_CYCLEACC BIT(1) @@ -929,6 +953,10 @@ #define MSR_IA32_APICBASE_BASE (0xfffff<<12) #define MSR_IA32_UCODE_WRITE 0x00000079 + +#define MSR_IA32_MCU_ENUMERATION 0x0000007b +#define MCU_STAGING BIT(4) + #define MSR_IA32_UCODE_REV 0x0000008b /* Intel SGX Launch Enclave Public Key Hash MSRs */ @@ -1226,6 +1254,8 @@ #define MSR_IA32_VMX_VMFUNC 0x00000491 #define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 +#define MSR_IA32_MCU_STAGING_MBOX_ADDR 0x000007a5 + /* Resctrl MSRs: */ /* - Intel: */ #define MSR_IA32_L3_QOS_CFG 0xc81 diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 53f4089333f2..2f0e47be79a4 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -9,6 +9,7 @@ #include <asm/alternative.h> #include <linux/kmsan-checks.h> +#include <linux/mmdebug.h> /* duplicated to the one in bootmem.h */ extern unsigned long max_pfn; @@ -31,13 +32,20 @@ static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) #ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); -extern unsigned long __phys_addr_symbol(unsigned long); #else #define __phys_addr(x) __phys_addr_nodebug(x) -#define __phys_addr_symbol(x) \ - ((unsigned long)(x) - __START_KERNEL_map + phys_base) #endif +static inline unsigned long __phys_addr_symbol(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* only check upper bounds since lower bounds will trigger carry */ + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + + return y + phys_base; +} + #define __phys_reloc_hide(x) (x) void clear_page_orig(void *page); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 332428caaed2..725d0eff7acd 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -23,6 +23,7 @@ #else /* !__ASSEMBLY__: */ #include <linux/args.h> +#include <linux/bits.h> #include <linux/build_bug.h> #include <linux/stringify.h> #include <asm/asm.h> @@ -572,9 +573,9 @@ do { \ #define x86_this_cpu_constant_test_bit(_nr, _var) \ ({ \ unsigned long __percpu *addr__ = \ - (unsigned long __percpu *)&(_var) + ((_nr) / BITS_PER_LONG); \ + (unsigned long __percpu *)&(_var) + BIT_WORD(_nr); \ \ - !!((1UL << ((_nr) % BITS_PER_LONG)) & raw_cpu_read(*addr__)); \ + !!(BIT_MASK(_nr) & raw_cpu_read(*addr__)); \ }) #define x86_this_cpu_variable_test_bit(_nr, _var) \ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 49a4d442f3fc..7276ba70c88a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -141,16 +141,16 @@ #define ARCH_PERFMON_EVENTS_COUNT 7 #define PEBS_DATACFG_MEMINFO BIT_ULL(0) -#define PEBS_DATACFG_GP BIT_ULL(1) +#define PEBS_DATACFG_GP BIT_ULL(1) #define PEBS_DATACFG_XMMS BIT_ULL(2) #define PEBS_DATACFG_LBRS BIT_ULL(3) -#define PEBS_DATACFG_LBR_SHIFT 24 #define PEBS_DATACFG_CNTR BIT_ULL(4) +#define PEBS_DATACFG_METRICS BIT_ULL(5) +#define PEBS_DATACFG_LBR_SHIFT 24 #define PEBS_DATACFG_CNTR_SHIFT 32 #define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) #define PEBS_DATACFG_FIX_SHIFT 48 #define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) -#define PEBS_DATACFG_METRICS BIT_ULL(5) /* Steal the highest bit of pebs_data_cfg for SW usage */ #define PEBS_UPDATE_DS_SW BIT_ULL(63) @@ -200,6 +200,8 @@ union cpuid10_edx { #define ARCH_PERFMON_EXT_LEAF 0x00000023 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 #define ARCH_PERFMON_ACR_LEAF 0x2 +#define ARCH_PERFMON_PEBS_CAP_LEAF 0x4 +#define ARCH_PERFMON_PEBS_COUNTER_LEAF 0x5 union cpuid35_eax { struct { @@ -210,7 +212,10 @@ union cpuid35_eax { unsigned int acr_subleaf:1; /* Events Sub-Leaf */ unsigned int events_subleaf:1; - unsigned int reserved:28; + /* arch-PEBS Sub-Leaves */ + unsigned int pebs_caps_subleaf:1; + unsigned int pebs_cnts_subleaf:1; + unsigned int reserved:26; } split; unsigned int full; }; @@ -432,6 +437,8 @@ static inline bool is_topdown_idx(int idx) #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) #define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55 #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) +#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT 54 +#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD BIT_ULL(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT) #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 #define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48) @@ -503,6 +510,107 @@ struct pebs_cntr_header { #define INTEL_CNTR_METRICS 0x3 /* + * Arch PEBS + */ +union arch_pebs_index { + struct { + u64 rsvd:4, + wr:23, + rsvd2:4, + full:1, + en:1, + rsvd3:3, + thresh:23, + rsvd4:5; + }; + u64 whole; +}; + +struct arch_pebs_header { + union { + u64 format; + struct { + u64 size:16, /* Record size */ + rsvd:14, + mode:1, /* 64BIT_MODE */ + cont:1, + rsvd2:3, + cntr:5, + lbr:2, + rsvd3:7, + xmm:1, + ymmh:1, + rsvd4:2, + opmask:1, + zmmh:1, + h16zmm:1, + rsvd5:5, + gpr:1, + aux:1, + basic:1; + }; + }; + u64 rsvd6; +}; + +struct arch_pebs_basic { + u64 ip; + u64 applicable_counters; + u64 tsc; + u64 retire :16, /* Retire Latency */ + valid :1, + rsvd :47; + u64 rsvd2; + u64 rsvd3; +}; + +struct arch_pebs_aux { + u64 address; + u64 rsvd; + u64 rsvd2; + u64 rsvd3; + u64 rsvd4; + u64 aux; + u64 instr_latency :16, + pad2 :16, + cache_latency :16, + pad3 :16; + u64 tsx_tuning; +}; + +struct arch_pebs_gprs { + u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di; + u64 r8, r9, r10, r11, r12, r13, r14, r15, ssp; + u64 rsvd; +}; + +struct arch_pebs_xer_header { + u64 xstate; + u64 rsvd; +}; + +#define ARCH_PEBS_LBR_NAN 0x0 +#define ARCH_PEBS_LBR_NUM_8 0x1 +#define ARCH_PEBS_LBR_NUM_16 0x2 +#define ARCH_PEBS_LBR_NUM_VAR 0x3 +#define ARCH_PEBS_BASE_LBR_ENTRIES 8 +struct arch_pebs_lbr_header { + u64 rsvd; + u64 ctl; + u64 depth; + u64 ler_from; + u64 ler_to; + u64 ler_info; +}; + +struct arch_pebs_cntr_header { + u32 cntr; + u32 fixed; + u32 metrics; + u32 reserved; +}; + +/* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ #define EXT_PERFMON_DEBUG_FEATURES 0x80000022 diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 50f75467f73d..b5dec859bc75 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -187,12 +187,12 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code); -static inline unsigned long regs_return_value(struct pt_regs *regs) +static __always_inline unsigned long regs_return_value(struct pt_regs *regs) { return regs->ax; } -static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) { regs->ax = rc; } @@ -277,34 +277,34 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) } #endif -static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } -static inline unsigned long instruction_pointer(struct pt_regs *regs) +static __always_inline unsigned long instruction_pointer(struct pt_regs *regs) { return regs->ip; } -static inline void instruction_pointer_set(struct pt_regs *regs, - unsigned long val) +static __always_inline +void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { regs->ip = val; } -static inline unsigned long frame_pointer(struct pt_regs *regs) +static __always_inline unsigned long frame_pointer(struct pt_regs *regs) { return regs->bp; } -static inline unsigned long user_stack_pointer(struct pt_regs *regs) +static __always_inline unsigned long user_stack_pointer(struct pt_regs *regs) { return regs->sp; } -static inline void user_stack_pointer_set(struct pt_regs *regs, - unsigned long val) +static __always_inline +void user_stack_pointer_set(struct pt_regs *regs, unsigned long val) { regs->sp = val; } diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h index 8d983cfd06ea..e5a13dc8816e 100644 --- a/arch/x86/include/asm/runtime-const.h +++ b/arch/x86/include/asm/runtime-const.h @@ -2,6 +2,10 @@ #ifndef _ASM_RUNTIME_CONST_H #define _ASM_RUNTIME_CONST_H +#ifdef MODULE + #error "Cannot use runtime-const infrastructure from modules" +#endif + #ifdef __ASSEMBLY__ .macro RUNTIME_CONST_PTR sym reg diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 6a0069761508..04958459a7ca 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/** +/* * Copyright(c) 2016-20 Intel Corporation. * * Intel Software Guard Extensions (SGX) support. @@ -28,21 +28,22 @@ #define SGX_CPUID_EPC_MASK GENMASK(3, 0) enum sgx_encls_function { - ECREATE = 0x00, - EADD = 0x01, - EINIT = 0x02, - EREMOVE = 0x03, - EDGBRD = 0x04, - EDGBWR = 0x05, - EEXTEND = 0x06, - ELDU = 0x08, - EBLOCK = 0x09, - EPA = 0x0A, - EWB = 0x0B, - ETRACK = 0x0C, - EAUG = 0x0D, - EMODPR = 0x0E, - EMODT = 0x0F, + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, + EAUG = 0x0D, + EMODPR = 0x0E, + EMODT = 0x0F, + EUPDATESVN = 0x18, }; /** @@ -65,15 +66,19 @@ enum sgx_encls_function { /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV - * %SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function. - * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not + * @SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function. + * @SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. - * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. - * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's + * @SGX_CHILD_PRESENT: SECS has child pages present in the EPC. + * @SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. - * %SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it + * @SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it * is in the PENDING or MODIFIED state. - * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received + * @SGX_INSUFFICIENT_ENTROPY: Insufficient entropy in RNG. + * @SGX_NO_UPDATE: EUPDATESVN could not update the CPUSVN because the + * current SVN was not newer than CPUSVN. This is the most + * common error code returned by EUPDATESVN. + * @SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { SGX_EPC_PAGE_CONFLICT = 7, @@ -81,6 +86,8 @@ enum sgx_return_code { SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, SGX_PAGE_NOT_MODIFIABLE = 20, + SGX_INSUFFICIENT_ENTROPY = 29, + SGX_NO_UPDATE = 31, SGX_UNMASKED_EVENT = 128, }; @@ -89,7 +96,7 @@ enum sgx_return_code { /** * enum sgx_miscselect - additional information to an SSA frame - * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. + * @SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. * * Save State Area (SSA) is a stack inside the enclave used to store processor * state when an exception or interrupt occurs. This enum defines additional @@ -105,17 +112,17 @@ enum sgx_miscselect { #define SGX_SSA_MISC_EXINFO_SIZE 16 /** - * enum sgx_attributes - the attributes field in &struct sgx_secs - * %SGX_ATTR_INIT: Enclave can be entered (is initialized). - * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). - * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. - * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote + * enum sgx_attribute - the attributes field in &struct sgx_secs + * @SGX_ATTR_INIT: Enclave can be entered (is initialized). + * @SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). + * @SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. + * @SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote * attestation. - * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). - * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to + * @SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). + * @SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to * sign cryptographic tokens that can be passed to * EINIT as an authorization to run an enclave. - * %SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an + * @SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an * asynchronous exit has occurred. */ enum sgx_attribute { @@ -188,7 +195,7 @@ struct sgx_secs { /** * enum sgx_tcs_flags - execution flags for TCS - * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints + * @SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints * inside an enclave. It is cleared by EADD but can * be set later with EDBGWR. */ @@ -253,11 +260,11 @@ struct sgx_pageinfo { /** * enum sgx_page_type - bits in the SECINFO flags defining the page type - * %SGX_PAGE_TYPE_SECS: a SECS page - * %SGX_PAGE_TYPE_TCS: a TCS page - * %SGX_PAGE_TYPE_REG: a regular page - * %SGX_PAGE_TYPE_VA: a VA page - * %SGX_PAGE_TYPE_TRIM: a page in trimmed state + * @SGX_PAGE_TYPE_SECS: a SECS page + * @SGX_PAGE_TYPE_TCS: a TCS page + * @SGX_PAGE_TYPE_REG: a regular page + * @SGX_PAGE_TYPE_VA: a VA page + * @SGX_PAGE_TYPE_TRIM: a page in trimmed state * * Make sure when making changes to this enum that its values can still fit * in the bitfield within &struct sgx_encl_page @@ -275,14 +282,14 @@ enum sgx_page_type { /** * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo - * %SGX_SECINFO_R: allow read - * %SGX_SECINFO_W: allow write - * %SGX_SECINFO_X: allow execution - * %SGX_SECINFO_SECS: a SECS page - * %SGX_SECINFO_TCS: a TCS page - * %SGX_SECINFO_REG: a regular page - * %SGX_SECINFO_VA: a VA page - * %SGX_SECINFO_TRIM: a page in trimmed state + * @SGX_SECINFO_R: allow read + * @SGX_SECINFO_W: allow write + * @SGX_SECINFO_X: allow execution + * @SGX_SECINFO_SECS: a SECS page + * @SGX_SECINFO_TCS: a TCS page + * @SGX_SECINFO_REG: a regular page + * @SGX_SECINFO_VA: a VA page + * @SGX_SECINFO_TRIM: a page in trimmed state */ enum sgx_secinfo_flags { SGX_SECINFO_R = BIT(0), diff --git a/arch/x86/include/asm/shared/msr.h b/arch/x86/include/asm/shared/msr.h index 1e6ec10b3a15..a20b1c08c99f 100644 --- a/arch/x86/include/asm/shared/msr.h +++ b/arch/x86/include/asm/shared/msr.h @@ -12,4 +12,19 @@ struct msr { }; }; +/* + * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the + * boot kernel since they rely on tracepoint/exception handling infrastructure + * that's not available here. + */ +static inline void raw_rdmsr(unsigned int reg, struct msr *m) +{ + asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg)); +} + +static inline void raw_wrmsr(unsigned int reg, const struct msr *m) +{ + asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory"); +} + #endif /* _ASM_X86_SHARED_MSR_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 22bfebe6776d..84951572ab81 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -109,7 +109,7 @@ int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_kick_ap(unsigned int cpu, struct task_struct *tidle); int native_cpu_disable(void); void __noreturn hlt_play_dead(void); -void native_play_dead(void); +void __noreturn native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); void wbinvd_on_all_cpus(void); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 17f6c3fedeee..0581c477d466 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -701,5 +701,6 @@ DEFINE_GHCB_ACCESSORS(sw_exit_info_1) DEFINE_GHCB_ACCESSORS(sw_exit_info_2) DEFINE_GHCB_ACCESSORS(sw_scratch) DEFINE_GHCB_ACCESSORS(xcr0) +DEFINE_GHCB_ACCESSORS(xss) #endif diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 21041898157a..1fadf0cf520c 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -218,6 +218,12 @@ static inline unsigned int topology_amd_nodes_per_pkg(void) return __amd_nodes_per_pkg; } +#else /* CONFIG_SMP */ +static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } +static inline int topology_max_smt_threads(void) { return 1; } +static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } +#endif /* !CONFIG_SMP */ + extern struct cpumask __cpu_primary_thread_mask; #define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) @@ -241,12 +247,6 @@ static inline bool topology_is_core_online(unsigned int cpu) } #define topology_is_core_online topology_is_core_online -#else /* CONFIG_SMP */ -static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } -static inline int topology_max_smt_threads(void) { return 1; } -static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } -#endif /* !CONFIG_SMP */ - static inline void arch_fix_phys_package_id(int num, u32 slot) { } @@ -325,4 +325,6 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick +extern int arch_sched_node_distance(int from, int to); + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 91a3fb8ae7ff..367297b188c3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -528,18 +528,18 @@ static __must_check __always_inline bool user_access_begin(const void __user *pt #define user_access_save() smap_save() #define user_access_restore(x) smap_restore(x) -#define unsafe_put_user(x, ptr, label) \ +#define arch_unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT -#define unsafe_get_user(x, ptr, err_label) \ +#define arch_unsafe_get_user(x, ptr, err_label) \ do { \ __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT -#define unsafe_get_user(x, ptr, err_label) \ +#define arch_unsafe_get_user(x, ptr, err_label) \ do { \ int __gu_err; \ __inttype(*(ptr)) __gu_val; \ @@ -618,11 +618,11 @@ do { \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT -#define __get_kernel_nofault(dst, src, type, err_label) \ +#define arch_get_kernel_nofault(dst, src, type, err_label) \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ sizeof(type), err_label) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT -#define __get_kernel_nofault(dst, src, type, err_label) \ +#define arch_get_kernel_nofault(dst, src, type, err_label) \ do { \ int __kr_err; \ \ @@ -633,7 +633,7 @@ do { \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT -#define __put_kernel_nofault(dst, src, type, err_label) \ +#define arch_put_kernel_nofault(dst, src, type, err_label) \ __put_user_size(*((type *)(src)), (__force type __user *)(dst), \ sizeof(type), err_label) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index c8a5ae35c871..641f45c22f9d 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -12,12 +12,12 @@ #include <asm/cpufeatures.h> #include <asm/page.h> #include <asm/percpu.h> -#include <asm/runtime-const.h> -/* - * Virtual variable: there's no actual backing store for this, - * it can purely be used as 'runtime_const_ptr(USER_PTR_MAX)' - */ +#ifdef MODULE + #define runtime_const_ptr(sym) (sym) +#else + #include <asm/runtime-const.h> +#endif extern unsigned long USER_PTR_MAX; #ifdef CONFIG_ADDRESS_MASKING diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h new file mode 100644 index 000000000000..12064284bc4e --- /dev/null +++ b/arch/x86/include/asm/unwind_user.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_UNWIND_USER_H +#define _ASM_X86_UNWIND_USER_H + +#ifdef CONFIG_HAVE_UNWIND_USER_FP + +#include <asm/ptrace.h> +#include <asm/uprobes.h> + +#define ARCH_INIT_USER_FP_FRAME(ws) \ + .cfa_off = 2*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = -2*(ws), \ + .use_fp = true, + +#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \ + .cfa_off = 1*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = 0, \ + .use_fp = false, + +static inline int unwind_user_word_size(struct pt_regs *regs) +{ + /* We can't unwind VM86 stacks */ + if (regs->flags & X86_VM_MASK) + return 0; +#ifdef CONFIG_X86_64 + if (!user_64bit_mode(regs)) + return sizeof(int); +#endif + return sizeof(long); +} + +static inline bool unwind_user_at_function_start(struct pt_regs *regs) +{ + return is_uprobe_at_func_entry(regs); +} + +#endif /* CONFIG_HAVE_UNWIND_USER_FP */ + +#endif /* _ASM_X86_UNWIND_USER_H */ diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1ee2e5115955..362210c79998 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -62,4 +62,13 @@ struct arch_uprobe_task { unsigned int saved_tf; }; +#ifdef CONFIG_UPROBES +extern bool is_uprobe_at_func_entry(struct pt_regs *regs); +#else +static bool is_uprobe_at_func_entry(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_UPROBES */ + #endif /* _ASM_UPROBES_H */ diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 2dd35bbdc822..3c4d52072189 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -10,7 +10,7 @@ /** * enum sgx_page_flags - page control flags - * %SGX_PAGE_MEASURE: Measure the page contents with a sequence of + * @SGX_PAGE_MEASURE: Measure the page contents with a sequence of * ENCLS[EEXTEND] operations. */ enum sgx_page_flags { @@ -143,6 +143,12 @@ struct sgx_enclave_run; /** * typedef sgx_enclave_user_handler_t - Exit handler function accepted by * __vdso_sgx_enter_enclave() + * @rdi: RDI at the time of EEXIT, undefined on AEX + * @rsi: RSI at the time of EEXIT, undefined on AEX + * @rdx: RDX at the time of EEXIT, undefined on AEX + * @rsp: RSP (untrusted) at the time of EEXIT or AEX + * @r8: R8 at the time of EEXIT, undefined on AEX + * @r9: R9 at the time of EEXIT, undefined on AEX * @run: The run instance given by the caller * * The register parameters contain the snapshot of their values at enclave @@ -166,7 +172,7 @@ typedef int (*sgx_enclave_user_handler_t)(long rdi, long rsi, long rdx, * @exception_addr: The address that triggered the exception * @user_handler: User provided callback run on exception * @user_data: Data passed to the user handler - * @reserved Reserved for future extensions + * @reserved: Reserved for future extensions * * If @user_handler is provided, the handler will be invoked on all return paths * of the normal flow. The user handler may transfer control, e.g. via a diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 9792e329343e..1baa86dfe029 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -93,6 +93,7 @@ #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_SEAMCALL 76 #define EXIT_REASON_TDCALL 77 #define EXIT_REASON_MSR_READ_IMM 84 #define EXIT_REASON_MSR_WRITE_IMM 85 diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index 0916f00a992e..e21419e686eb 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -19,6 +19,8 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) if (!cmc->enabled) return 0; + mce_save_apei_thr_limit(cmc->notify.error_threshold_value); + /* * We expect HEST to provide a list of MC banks that report errors * in firmware first mode. Otherwise, return non-zero value to diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 7047124490f6..d7c8ef1e354d 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -196,7 +196,7 @@ int amd_detect_prefcore(bool *detected) break; } - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { u32 tmp; int ret; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8ee5ff547357..e377b06e70e3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -9,6 +9,7 @@ #include <asm/text-patching.h> #include <asm/insn.h> +#include <asm/insn-eval.h> #include <asm/ibt.h> #include <asm/set_memory.h> #include <asm/nmi.h> @@ -346,25 +347,6 @@ static void add_nop(u8 *buf, unsigned int len) } /* - * Matches NOP and NOPL, not any of the other possible NOPs. - */ -static bool insn_is_nop(struct insn *insn) -{ - /* Anything NOP, but no REP NOP */ - if (insn->opcode.bytes[0] == 0x90 && - (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) - return true; - - /* NOPL */ - if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) - return true; - - /* TODO: more nops */ - - return false; -} - -/* * Find the offset of the first non-NOP instruction starting at @offset * but no further than @len. */ @@ -559,7 +541,7 @@ EXPORT_SYMBOL(BUG_func); * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ -static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) +static unsigned int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) { void *target, *bug = &BUG_func; s32 disp; @@ -643,7 +625,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { - int insn_buff_sz = 0; + unsigned int insn_buff_sz = 0; /* * In case of nested ALTERNATIVE()s the outer alternative might @@ -683,11 +665,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; - if (a->flags & ALT_FLAG_DIRECT_CALL) { + if (a->flags & ALT_FLAG_DIRECT_CALL) insn_buff_sz = alt_replace_call(instr, insn_buff, a); - if (insn_buff_sz < 0) - continue; - } for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; @@ -2244,21 +2223,34 @@ int alternatives_text_reserved(void *start, void *end) * See entry_{32,64}.S for more details. */ -/* - * We define the int3_magic() function in assembly to control the calling - * convention such that we can 'call' it from assembly. - */ +extern void int3_selftest_asm(unsigned int *ptr); -extern void int3_magic(unsigned int *ptr); /* defined in asm */ +asm ( +" .pushsection .init.text, \"ax\", @progbits\n" +" .type int3_selftest_asm, @function\n" +"int3_selftest_asm:\n" + ANNOTATE_NOENDBR + /* + * INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an + * exception, then the int3_exception_nb notifier emulates a call to + * int3_selftest_callee(). + */ +" int3; nop; nop; nop; nop\n" + ASM_RET +" .size int3_selftest_asm, . - int3_selftest_asm\n" +" .popsection\n" +); + +extern void int3_selftest_callee(unsigned int *ptr); asm ( " .pushsection .init.text, \"ax\", @progbits\n" -" .type int3_magic, @function\n" -"int3_magic:\n" +" .type int3_selftest_callee, @function\n" +"int3_selftest_callee:\n" ANNOTATE_NOENDBR -" movl $1, (%" _ASM_ARG1 ")\n" +" movl $0x1234, (%" _ASM_ARG1 ")\n" ASM_RET -" .size int3_magic, .-int3_magic\n" +" .size int3_selftest_callee, . - int3_selftest_callee\n" " .popsection\n" ); @@ -2267,7 +2259,7 @@ extern void int3_selftest_ip(void); /* defined in asm below */ static int __init int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) { - unsigned long selftest = (unsigned long)&int3_selftest_ip; + unsigned long selftest = (unsigned long)&int3_selftest_asm; struct die_args *args = data; struct pt_regs *regs = args->regs; @@ -2282,7 +2274,7 @@ int3_exception_notify(struct notifier_block *self, unsigned long val, void *data if (regs->ip - INT3_INSN_SIZE != selftest) return NOTIFY_DONE; - int3_emulate_call(regs, (unsigned long)&int3_magic); + int3_emulate_call(regs, (unsigned long)&int3_selftest_callee); return NOTIFY_STOP; } @@ -2298,19 +2290,11 @@ static noinline void __init int3_selftest(void) BUG_ON(register_die_notifier(&int3_exception_nb)); /* - * Basically: int3_magic(&val); but really complicated :-) - * - * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb - * notifier above will emulate CALL for us. + * Basically: int3_selftest_callee(&val); but really complicated :-) */ - asm volatile ("int3_selftest_ip:\n\t" - ANNOTATE_NOENDBR - " int3; nop; nop; nop; nop\n\t" - : ASM_CALL_CONSTRAINT - : __ASM_SEL_RAW(a, D) (&val) - : "memory"); - - BUG_ON(val != 1); + int3_selftest_asm(&val); + + BUG_ON(val != 0x1234); unregister_die_notifier(&int3_exception_nb); } diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index a40176b62eb5..3d0a4768d603 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -34,62 +34,6 @@ struct pci_dev *amd_node_get_func(u16 node, u8 func) return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func)); } -#define DF_BLK_INST_CNT 0x040 -#define DF_CFG_ADDR_CNTL_LEGACY 0x084 -#define DF_CFG_ADDR_CNTL_DF4 0xC04 - -#define DF_MAJOR_REVISION GENMASK(27, 24) - -static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0) -{ - u32 reg; - - /* - * Revision fields added for DF4 and later. - * - * Major revision of '0' is found pre-DF4. Field is Read-as-Zero. - */ - if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, ®)) - return 0; - - if (reg & DF_MAJOR_REVISION) - return DF_CFG_ADDR_CNTL_DF4; - - return DF_CFG_ADDR_CNTL_LEGACY; -} - -struct pci_dev *amd_node_get_root(u16 node) -{ - struct pci_dev *root; - u16 cntl_off; - u8 bus; - - if (!cpu_feature_enabled(X86_FEATURE_ZEN)) - return NULL; - - /* - * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl) - * Bits [7:0] (SecBusNum) holds the bus number of the root device for - * this Data Fabric instance. The segment, device, and function will be 0. - */ - struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0); - if (!df_f0) - return NULL; - - cntl_off = get_cfg_addr_cntl_offset(df_f0); - if (!cntl_off) - return NULL; - - if (pci_read_config_byte(df_f0, cntl_off, &bus)) - return NULL; - - /* Grab the pointer for the actual root device instance. */ - root = pci_get_domain_bus_and_slot(0, bus, 0); - - pci_dbg(root, "is root for AMD node %u\n", node); - return root; -} - static struct pci_dev **amd_roots; /* Protect the PCI config register pairs used for SMN. */ @@ -274,51 +218,21 @@ DEFINE_SHOW_STORE_ATTRIBUTE(smn_node); DEFINE_SHOW_STORE_ATTRIBUTE(smn_address); DEFINE_SHOW_STORE_ATTRIBUTE(smn_value); -static int amd_cache_roots(void) -{ - u16 node, num_nodes = amd_num_nodes(); - - amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL); - if (!amd_roots) - return -ENOMEM; - - for (node = 0; node < num_nodes; node++) - amd_roots[node] = amd_node_get_root(node); - - return 0; -} - -static int reserve_root_config_spaces(void) +static struct pci_dev *get_next_root(struct pci_dev *root) { - struct pci_dev *root = NULL; - struct pci_bus *bus = NULL; - - while ((bus = pci_find_next_bus(bus))) { - /* Root device is Device 0 Function 0 on each Primary Bus. */ - root = pci_get_slot(bus, 0); - if (!root) + while ((root = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, root))) { + /* Root device is Device 0 Function 0. */ + if (root->devfn) continue; if (root->vendor != PCI_VENDOR_ID_AMD && root->vendor != PCI_VENDOR_ID_HYGON) continue; - pci_dbg(root, "Reserving PCI config space\n"); - - /* - * There are a few SMN index/data pairs and other registers - * that shouldn't be accessed by user space. - * So reserve the entire PCI config space for simplicity rather - * than covering specific registers piecemeal. - */ - if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) { - pci_err(root, "Failed to reserve config space\n"); - return -EEXIST; - } + break; } - smn_exclusive = true; - return 0; + return root; } static bool enable_dfs; @@ -332,7 +246,8 @@ __setup("amd_smn_debugfs_enable", amd_smn_enable_dfs); static int __init amd_smn_init(void) { - int err; + u16 count, num_roots, roots_per_node, node, num_nodes; + struct pci_dev *root; if (!cpu_feature_enabled(X86_FEATURE_ZEN)) return 0; @@ -342,13 +257,48 @@ static int __init amd_smn_init(void) if (amd_roots) return 0; - err = amd_cache_roots(); - if (err) - return err; + num_roots = 0; + root = NULL; + while ((root = get_next_root(root))) { + pci_dbg(root, "Reserving PCI config space\n"); - err = reserve_root_config_spaces(); - if (err) - return err; + /* + * There are a few SMN index/data pairs and other registers + * that shouldn't be accessed by user space. So reserve the + * entire PCI config space for simplicity rather than covering + * specific registers piecemeal. + */ + if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) { + pci_err(root, "Failed to reserve config space\n"); + return -EEXIST; + } + + num_roots++; + } + + pr_debug("Found %d AMD root devices\n", num_roots); + + if (!num_roots) + return -ENODEV; + + num_nodes = amd_num_nodes(); + amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL); + if (!amd_roots) + return -ENOMEM; + + roots_per_node = num_roots / num_nodes; + + count = 0; + node = 0; + root = NULL; + while (node < num_nodes && (root = get_next_root(root))) { + /* Use one root for each node and skip the rest. */ + if (count++ % roots_per_node) + continue; + + pci_dbg(root, "is root for AMD node %u\n", node); + amd_roots[node++] = root; + } if (enable_dfs) { debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir); @@ -358,6 +308,8 @@ static int __init amd_smn_init(void) debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops); } + smn_exclusive = true; + return 0; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dcf4dc7a9eac..9c29e12b84e5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -174,6 +174,7 @@ static struct resource lapic_resource = { .flags = IORESOURCE_MEM | IORESOURCE_BUSY, }; +/* Measured in ticks per HZ. */ unsigned int lapic_timer_period = 0; static void apic_pm_activate(void); @@ -793,6 +794,7 @@ static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); u64 tsc_perj = 0, tsc_start = 0; + long delta_tsc_khz, bus_khz; unsigned long jif_start; unsigned long deltaj; long delta, deltatsc; @@ -895,14 +897,15 @@ static int __init calibrate_APIC_clock(void) apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period); if (boot_cpu_has(X86_FEATURE_TSC)) { - apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n", - (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), - (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + delta_tsc_khz = (deltatsc * HZ) / (1000 * LAPIC_CAL_LOOPS); + + apic_pr_verbose("..... CPU clock speed is %ld.%03ld MHz.\n", + delta_tsc_khz / 1000, delta_tsc_khz % 1000); } - apic_pr_verbose("..... host bus clock speed is %u.%04u MHz.\n", - lapic_timer_period / (1000000 / HZ), - lapic_timer_period % (1000000 / HZ)); + bus_khz = (long)lapic_timer_period * HZ / 1000; + apic_pr_verbose("..... host bus clock speed is %ld.%03ld MHz.\n", + bus_khz / 1000, bus_khz % 1000); /* * Do a sanity check on the APIC calibration result diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5ba2feb2c04c..1e0442e867b1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2864,7 +2864,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, ioapic = mp_irqdomain_ioapic_idx(domain); pin = info->ioapic.pin; - if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) + if (irq_resolve_mapping(domain, (irq_hw_number_t)pin)) return -EEXIST; data = kzalloc(sizeof(*data), GFP_KERNEL); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 69a3c02cab48..bc94ff1e250a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1037,7 +1037,15 @@ static void init_amd_zen4(struct cpuinfo_x86 *c) static const struct x86_cpu_id zen5_rdseed_microcode[] = { ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a), + ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121), ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054), + ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037), + ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035), + ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108), + ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037), + ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038), + ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037), + {}, }; static void init_amd_zen5(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d11a7655994e..d8660770dc6a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -54,53 +54,6 @@ * mitigation option. */ -static void __init spectre_v1_select_mitigation(void); -static void __init spectre_v1_apply_mitigation(void); -static void __init spectre_v2_select_mitigation(void); -static void __init spectre_v2_update_mitigation(void); -static void __init spectre_v2_apply_mitigation(void); -static void __init retbleed_select_mitigation(void); -static void __init retbleed_update_mitigation(void); -static void __init retbleed_apply_mitigation(void); -static void __init spectre_v2_user_select_mitigation(void); -static void __init spectre_v2_user_update_mitigation(void); -static void __init spectre_v2_user_apply_mitigation(void); -static void __init ssb_select_mitigation(void); -static void __init ssb_apply_mitigation(void); -static void __init l1tf_select_mitigation(void); -static void __init l1tf_apply_mitigation(void); -static void __init mds_select_mitigation(void); -static void __init mds_update_mitigation(void); -static void __init mds_apply_mitigation(void); -static void __init taa_select_mitigation(void); -static void __init taa_update_mitigation(void); -static void __init taa_apply_mitigation(void); -static void __init mmio_select_mitigation(void); -static void __init mmio_update_mitigation(void); -static void __init mmio_apply_mitigation(void); -static void __init rfds_select_mitigation(void); -static void __init rfds_update_mitigation(void); -static void __init rfds_apply_mitigation(void); -static void __init srbds_select_mitigation(void); -static void __init srbds_apply_mitigation(void); -static void __init l1d_flush_select_mitigation(void); -static void __init srso_select_mitigation(void); -static void __init srso_update_mitigation(void); -static void __init srso_apply_mitigation(void); -static void __init gds_select_mitigation(void); -static void __init gds_apply_mitigation(void); -static void __init bhi_select_mitigation(void); -static void __init bhi_update_mitigation(void); -static void __init bhi_apply_mitigation(void); -static void __init its_select_mitigation(void); -static void __init its_update_mitigation(void); -static void __init its_apply_mitigation(void); -static void __init tsa_select_mitigation(void); -static void __init tsa_apply_mitigation(void); -static void __init vmscape_select_mitigation(void); -static void __init vmscape_update_mitigation(void); -static void __init vmscape_apply_mitigation(void); - /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -233,99 +186,6 @@ static void __init cpu_print_attack_vectors(void) } } -void __init cpu_select_mitigations(void) -{ - /* - * Read the SPEC_CTRL MSR to account for reserved bits which may - * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD - * init code as it is not enumerated and depends on the family. - */ - if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { - rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - - /* - * Previously running kernel (kexec), may have some controls - * turned ON. Clear them and let the mitigations setup below - * rediscover them based on configuration. - */ - x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; - } - - x86_arch_cap_msr = x86_read_arch_cap_msr(); - - cpu_print_attack_vectors(); - - /* Select the proper CPU mitigations before patching alternatives: */ - spectre_v1_select_mitigation(); - spectre_v2_select_mitigation(); - retbleed_select_mitigation(); - spectre_v2_user_select_mitigation(); - ssb_select_mitigation(); - l1tf_select_mitigation(); - mds_select_mitigation(); - taa_select_mitigation(); - mmio_select_mitigation(); - rfds_select_mitigation(); - srbds_select_mitigation(); - l1d_flush_select_mitigation(); - srso_select_mitigation(); - gds_select_mitigation(); - its_select_mitigation(); - bhi_select_mitigation(); - tsa_select_mitigation(); - vmscape_select_mitigation(); - - /* - * After mitigations are selected, some may need to update their - * choices. - */ - spectre_v2_update_mitigation(); - /* - * retbleed_update_mitigation() relies on the state set by - * spectre_v2_update_mitigation(); specifically it wants to know about - * spectre_v2=ibrs. - */ - retbleed_update_mitigation(); - /* - * its_update_mitigation() depends on spectre_v2_update_mitigation() - * and retbleed_update_mitigation(). - */ - its_update_mitigation(); - - /* - * spectre_v2_user_update_mitigation() depends on - * retbleed_update_mitigation(), specifically the STIBP - * selection is forced for UNRET or IBPB. - */ - spectre_v2_user_update_mitigation(); - mds_update_mitigation(); - taa_update_mitigation(); - mmio_update_mitigation(); - rfds_update_mitigation(); - bhi_update_mitigation(); - /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ - srso_update_mitigation(); - vmscape_update_mitigation(); - - spectre_v1_apply_mitigation(); - spectre_v2_apply_mitigation(); - retbleed_apply_mitigation(); - spectre_v2_user_apply_mitigation(); - ssb_apply_mitigation(); - l1tf_apply_mitigation(); - mds_apply_mitigation(); - taa_apply_mitigation(); - mmio_apply_mitigation(); - rfds_apply_mitigation(); - srbds_apply_mitigation(); - srso_apply_mitigation(); - gds_apply_mitigation(); - its_apply_mitigation(); - bhi_apply_mitigation(); - tsa_apply_mitigation(); - vmscape_apply_mitigation(); -} - /* * NOTE: This function is *only* called for SVM, since Intel uses * MSR_IA32_SPEC_CTRL for SSBD. @@ -3369,6 +3229,99 @@ void cpu_bugs_smt_update(void) mutex_unlock(&spec_ctrl_mutex); } +void __init cpu_select_mitigations(void) +{ + /* + * Read the SPEC_CTRL MSR to account for reserved bits which may + * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD + * init code as it is not enumerated and depends on the family. + */ + if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { + rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + /* + * Previously running kernel (kexec), may have some controls + * turned ON. Clear them and let the mitigations setup below + * rediscover them based on configuration. + */ + x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; + } + + x86_arch_cap_msr = x86_read_arch_cap_msr(); + + cpu_print_attack_vectors(); + + /* Select the proper CPU mitigations before patching alternatives: */ + spectre_v1_select_mitigation(); + spectre_v2_select_mitigation(); + retbleed_select_mitigation(); + spectre_v2_user_select_mitigation(); + ssb_select_mitigation(); + l1tf_select_mitigation(); + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + rfds_select_mitigation(); + srbds_select_mitigation(); + l1d_flush_select_mitigation(); + srso_select_mitigation(); + gds_select_mitigation(); + its_select_mitigation(); + bhi_select_mitigation(); + tsa_select_mitigation(); + vmscape_select_mitigation(); + + /* + * After mitigations are selected, some may need to update their + * choices. + */ + spectre_v2_update_mitigation(); + /* + * retbleed_update_mitigation() relies on the state set by + * spectre_v2_update_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_update_mitigation(); + /* + * its_update_mitigation() depends on spectre_v2_update_mitigation() + * and retbleed_update_mitigation(). + */ + its_update_mitigation(); + + /* + * spectre_v2_user_update_mitigation() depends on + * retbleed_update_mitigation(), specifically the STIBP + * selection is forced for UNRET or IBPB. + */ + spectre_v2_user_update_mitigation(); + mds_update_mitigation(); + taa_update_mitigation(); + mmio_update_mitigation(); + rfds_update_mitigation(); + bhi_update_mitigation(); + /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ + srso_update_mitigation(); + vmscape_update_mitigation(); + + spectre_v1_apply_mitigation(); + spectre_v2_apply_mitigation(); + retbleed_apply_mitigation(); + spectre_v2_user_apply_mitigation(); + ssb_apply_mitigation(); + l1tf_apply_mitigation(); + mds_apply_mitigation(); + taa_apply_mitigation(); + mmio_apply_mitigation(); + rfds_apply_mitigation(); + srbds_apply_mitigation(); + srso_apply_mitigation(); + gds_apply_mitigation(); + its_apply_mitigation(); + bhi_apply_mitigation(); + tsa_apply_mitigation(); + vmscape_apply_mitigation(); +} + #ifdef CONFIG_SYSFS #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 71bb04e6a5bc..9aae990ed7c7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -79,6 +79,10 @@ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +/* Used for modules: built-in code uses runtime constants */ +unsigned long USER_PTR_MAX; +EXPORT_SYMBOL(USER_PTR_MAX); + u32 elf_hwcap2 __read_mostly; /* Number of siblings per CPU package */ @@ -2580,7 +2584,7 @@ void __init arch_cpu_finalize_init(void) alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { - unsigned long USER_PTR_MAX = TASK_SIZE_MAX; + USER_PTR_MAX = TASK_SIZE_MAX; /* * Enable this when LAM is gated on LASS support diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index bc38b2d56f26..5c7a3a71191a 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -42,15 +42,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; #ifdef CONFIG_CPU_SUP_INTEL -enum tsx_ctrl_states { - TSX_CTRL_ENABLE, - TSX_CTRL_DISABLE, - TSX_CTRL_RTM_ALWAYS_ABORT, - TSX_CTRL_NOT_SUPPORTED, -}; - -extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; - extern void __init tsx_init(void); void tsx_ap_init(void); void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 46efcbd6afa4..a40f5545e25b 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -72,6 +72,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL }, { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL }, + { X86_FEATURE_SDCIAE, X86_FEATURE_CAT_L3 }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, @@ -79,6 +80,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_SGX_EUPDATESVN, X86_FEATURE_SGX1 }, { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index d6906442f49b..3f1dda355307 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -43,9 +43,6 @@ /* Deferred error settings */ #define MSR_CU_DEF_ERR 0xC0000410 #define MASK_DEF_LVTOFF 0x000000F0 -#define MASK_DEF_INT_TYPE 0x00000006 -#define DEF_LVT_OFF 0x2 -#define DEF_INT_TYPE_APIC 0x2 /* Scalable MCA: */ @@ -54,6 +51,17 @@ static bool thresholding_irq_en; +struct mce_amd_cpu_data { + mce_banks_t thr_intr_banks; + mce_banks_t dfr_intr_banks; + + u32 thr_intr_en: 1, + dfr_intr_en: 1, + __resv: 30; +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data); + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -79,6 +87,8 @@ struct smca_bank { const struct smca_hwid *hwid; u32 id; /* Value of MCA_IPID[InstanceId]. */ u8 sysfs_id; /* Value used for sysfs name. */ + u64 paddrv :1, /* Physical Address Valid bit in MCA_CONFIG */ + __reserved :63; }; static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); @@ -264,6 +274,7 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; static void smca_configure(unsigned int bank, unsigned int cpu) { + struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data); u8 *bank_counts = this_cpu_ptr(smca_bank_counts); const struct smca_hwid *s_hwid; unsigned int i, hwid_mcatype; @@ -294,11 +305,33 @@ static void smca_configure(unsigned int bank, unsigned int cpu) * APIC based interrupt. First, check that no interrupt has been * set. */ - if ((low & BIT(5)) && !((high >> 5) & 0x3)) + if ((low & BIT(5)) && !((high >> 5) & 0x3) && data->dfr_intr_en) { + __set_bit(bank, data->dfr_intr_banks); high |= BIT(5); + } + + /* + * SMCA Corrected Error Interrupt + * + * MCA_CONFIG[IntPresent] is bit 10, and tells us if the bank can + * send an MCA Thresholding interrupt without the OS initializing + * this feature. This can be used if the threshold limit is managed + * by the platform. + * + * MCA_CONFIG[IntEn] is bit 40 (8 in the high portion of the MSR). + * The OS should set this to inform the platform that the OS is ready + * to handle the MCA Thresholding interrupt. + */ + if ((low & BIT(10)) && data->thr_intr_en) { + __set_bit(bank, data->thr_intr_banks); + high |= BIT(8); + } this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8)); + if (low & MCI_CONFIG_PADDRV) + this_cpu_ptr(smca_banks)[bank].paddrv = 1; + wrmsr(smca_config, low, high); } @@ -368,6 +401,14 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) { int msr = (hi & MASK_LVTOFF_HI) >> 20; + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return false; + if (apic < 0) { pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, @@ -376,14 +417,6 @@ static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) } if (apic != msr) { - /* - * On SMCA CPUs, LVT offset is programmed at a different MSR, and - * the BIOS provides the value. The original field where LVT offset - * was set is reserved. Return early here: - */ - if (mce_flags.smca) - return false; - pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); @@ -443,6 +476,36 @@ static void threshold_restart_block(void *_tr) wrmsr(tr->b->address, lo, hi); } +static void threshold_restart_bank(unsigned int bank, bool intr_en) +{ + struct threshold_bank **thr_banks = this_cpu_read(threshold_banks); + struct threshold_block *block, *tmp; + struct thresh_restart tr; + + if (!thr_banks || !thr_banks[bank]) + return; + + memset(&tr, 0, sizeof(tr)); + + list_for_each_entry_safe(block, tmp, &thr_banks[bank]->miscj, miscj) { + tr.b = block; + tr.b->interrupt_enable = intr_en; + threshold_restart_block(&tr); + } +} + +/* Try to use the threshold limit reported through APEI. */ +static u16 get_thr_limit(void) +{ + u32 thr_limit = mce_get_apei_thr_limit(); + + /* Fallback to old default if APEI limit is not available. */ + if (!thr_limit) + return THRESHOLD_MAX; + + return min(thr_limit, THRESHOLD_MAX); +} + static void mce_threshold_block_init(struct threshold_block *b, int offset) { struct thresh_restart tr = { @@ -451,7 +514,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) .lvt_off = offset, }; - b->threshold_limit = THRESHOLD_MAX; + b->threshold_limit = get_thr_limit(); threshold_restart_block(&tr); }; @@ -464,41 +527,6 @@ static int setup_APIC_mce_threshold(int reserved, int new) return reserved; } -static int setup_APIC_deferred_error(int reserved, int new) -{ - if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, - APIC_EILVT_MSG_FIX, 0)) - return new; - - return reserved; -} - -static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) -{ - u32 low = 0, high = 0; - int def_offset = -1, def_new; - - if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) - return; - - def_new = (low & MASK_DEF_LVTOFF) >> 4; - if (!(low & MASK_DEF_LVTOFF)) { - pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); - def_new = DEF_LVT_OFF; - low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); - } - - def_offset = setup_APIC_deferred_error(def_offset, def_new); - if ((def_offset == def_new) && - (deferred_error_int_vector != amd_deferred_error_interrupt)) - deferred_error_int_vector = amd_deferred_error_interrupt; - - if (!mce_flags.smca) - low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; - - wrmsr(MSR_CU_DEF_ERR, low, high); -} - static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block, unsigned int cpu) @@ -534,12 +562,10 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, return addr; } -static int -prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, - int offset, u32 misc_high) +static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); - u32 smca_low, smca_high; struct threshold_block b; int new; @@ -556,20 +582,13 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, if (!b.interrupt_capable) goto done; + __set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks); b.interrupt_enable = 1; - if (!mce_flags.smca) { - new = (misc_high & MASK_LVTOFF_HI) >> 20; - goto set_offset; - } - - /* Gather LVT offset for thresholding: */ - if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) - goto out; - - new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + if (mce_flags.smca) + goto done; -set_offset: + new = (misc_high & MASK_LVTOFF_HI) >> 20; offset = setup_APIC_mce_threshold(offset, new); if (offset == new) thresholding_irq_en = true; @@ -577,7 +596,6 @@ set_offset: done: mce_threshold_block_init(&b, offset); -out: return offset; } @@ -668,6 +686,32 @@ static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c) mce_banks[0].ctl = 0; } +/* + * Enable the APIC LVT interrupt vectors once per-CPU. This should be done before hardware is + * ready to send interrupts. + * + * Individual error sources are enabled later during per-bank init. + */ +static void smca_enable_interrupt_vectors(void) +{ + struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data); + u64 mca_intr_cfg, offset; + + if (!mce_flags.smca || !mce_flags.succor) + return; + + if (rdmsrq_safe(MSR_CU_DEF_ERR, &mca_intr_cfg)) + return; + + offset = (mca_intr_cfg & SMCA_THR_LVT_OFF) >> 12; + if (!setup_APIC_eilvt(offset, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0)) + data->thr_intr_en = 1; + + offset = (mca_intr_cfg & MASK_DEF_LVTOFF) >> 4; + if (!setup_APIC_eilvt(offset, DEFERRED_ERROR_VECTOR, APIC_EILVT_MSG_FIX, 0)) + data->dfr_intr_en = 1; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -679,10 +723,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) mce_flags.amd_threshold = 1; + smca_enable_interrupt_vectors(); + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { - if (mce_flags.smca) + if (mce_flags.smca) { smca_configure(bank, cpu); + if (!this_cpu_ptr(&mce_amd_data)->thr_intr_en) + continue; + } + disable_err_thresholding(c, bank); for (block = 0; block < NR_BLOCKS; ++block) { @@ -703,9 +753,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) offset = prepare_threshold_block(bank, block, address, offset, high); } } - - if (mce_flags.succor) - deferred_error_interrupt_enable(c); } void smca_bsp_init(void) @@ -748,9 +795,9 @@ bool amd_mce_is_memory_error(struct mce *m) } /* - * AMD systems do not have an explicit indicator that the value in MCA_ADDR is - * a system physical address. Therefore, individual cases need to be detected. - * Future cases and checks will be added as needed. + * Some AMD systems have an explicit indicator that the value in MCA_ADDR is a + * system physical address. Individual cases though, need to be detected for + * other systems. Future cases will be added as needed. * * 1) General case * a) Assume address is not usable. @@ -764,6 +811,8 @@ bool amd_mce_is_memory_error(struct mce *m) * a) Reported in legacy bank 4 with extended error code (XEC) 8. * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore, * this bit should not be checked. + * 4) MCI_STATUS_PADDRVAL is set + * a) Will provide a valid system physical address. * * NOTE: SMCA UMC memory errors fall into case #1. */ @@ -777,6 +826,9 @@ bool amd_mce_usable_address(struct mce *m) return false; } + if (this_cpu_ptr(smca_banks)[m->bank].paddrv) + return m->status & MCI_STATUS_PADDRV; + /* Check poison bit for all other bank types. */ if (m->status & MCI_STATUS_POISON) return true; @@ -785,37 +837,6 @@ bool amd_mce_usable_address(struct mce *m) return false; } -static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) -{ - struct mce_hw_err err; - struct mce *m = &err.m; - - mce_prep_record(&err); - - m->status = status; - m->misc = misc; - m->bank = bank; - m->tsc = rdtsc(); - - if (m->status & MCI_STATUS_ADDRV) { - m->addr = addr; - - smca_extract_err_addr(m); - } - - if (mce_flags.smca) { - rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - - if (m->status & MCI_STATUS_SYNDV) { - rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); - rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); - rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); - } - } - - mce_log(&err); -} - DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) { trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); @@ -825,103 +846,20 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) apic_eoi(); } -/* - * Returns true if the logged error is deferred. False, otherwise. - */ -static inline bool -_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) -{ - u64 status, addr = 0; - - rdmsrq(msr_stat, status); - if (!(status & MCI_STATUS_VAL)) - return false; - - if (status & MCI_STATUS_ADDRV) - rdmsrq(msr_addr, addr); - - __log_error(bank, status, addr, misc); - - wrmsrq(msr_stat, 0); - - return status & MCI_STATUS_DEFERRED; -} - -static bool _log_error_deferred(unsigned int bank, u32 misc) -{ - if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), - mca_msr_reg(bank, MCA_ADDR), misc)) - return false; - - /* - * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers. - * Return true here to avoid accessing these registers. - */ - if (!mce_flags.smca) - return true; - - /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ - wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); - return true; -} - -/* - * We have three scenarios for checking for Deferred errors: - * - * 1) Non-SMCA systems check MCA_STATUS and log error if found. - * 2) SMCA systems check MCA_STATUS. If error is found then log it and also - * clear MCA_DESTAT. - * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and - * log it. - */ -static void log_error_deferred(unsigned int bank) -{ - if (_log_error_deferred(bank, 0)) - return; - - /* - * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check - * for a valid error. - */ - _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), - MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); -} - /* APIC interrupt handler for deferred errors */ static void amd_deferred_error_interrupt(void) { - unsigned int bank; - - for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) - log_error_deferred(bank); + machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks); } -static void log_error_thresholding(unsigned int bank, u64 misc) +void mce_amd_handle_storm(unsigned int bank, bool on) { - _log_error_deferred(bank, misc); + threshold_restart_bank(bank, on); } -static void log_and_reset_block(struct threshold_block *block) +static void amd_reset_thr_limit(unsigned int bank) { - struct thresh_restart tr; - u32 low = 0, high = 0; - - if (!block) - return; - - if (rdmsr_safe(block->address, &low, &high)) - return; - - if (!(high & MASK_OVERFLOW_HI)) - return; - - /* Log the MCE which caused the threshold event. */ - log_error_thresholding(block->bank, ((u64)high << 32) | low); - - /* Reset threshold block after logging error. */ - memset(&tr, 0, sizeof(tr)); - tr.b = block; - threshold_restart_block(&tr); + threshold_restart_bank(bank, true); } /* @@ -930,33 +868,21 @@ static void log_and_reset_block(struct threshold_block *block) */ static void amd_threshold_interrupt(void) { - struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank; - unsigned int bank, cpu = smp_processor_id(); - struct threshold_block *block, *tmp; - - /* - * Validate that the threshold bank has been initialized already. The - * handler is installed at boot time, but on a hotplug event the - * interrupt might fire before the data has been initialized. - */ - if (!bp) - return; - - for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { - if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) - continue; - - thr_bank = bp[bank]; - if (!thr_bank) - continue; - - list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj) - log_and_reset_block(block); - } + machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks); } void amd_clear_bank(struct mce *m) { + amd_reset_thr_limit(m->bank); + + /* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */ + if (m->status & MCI_STATUS_DEFERRED) + mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0); + + /* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */ + if (m->kflags & MCE_CHECK_DFR_REGS) + return; + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); } @@ -1172,7 +1098,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb b->address = address; b->interrupt_enable = 0; b->interrupt_capable = lvt_interrupt_supported(bank, high); - b->threshold_limit = THRESHOLD_MAX; + b->threshold_limit = get_thr_limit(); if (b->interrupt_capable) { default_attrs[2] = &interrupt_enable.attr; @@ -1183,6 +1109,8 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb list_add(&b->miscj, &tb->miscj); + mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20); + err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) goto out_free; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 460e90a1a0b1..4aff14e04287 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -687,7 +687,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR)); + if (m->kflags & MCE_CHECK_DFR_REGS) + m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i)); + else + m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -715,6 +718,29 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); /* + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. + */ +static bool smca_should_log_poll_error(struct mce *m) +{ + if (m->status & MCI_STATUS_VAL) + return true; + + m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank)); + if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) { + m->kflags |= MCE_CHECK_DFR_REGS; + return true; + } + + return false; +} + +/* * Newer Intel systems that support software error * recovery need to make additional checks. Other * CPUs should skip over uncorrected errors, but log @@ -740,6 +766,9 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) { struct mce *m = &err->m; + if (mce_flags.smca) + return smca_should_log_poll_error(m); + /* If this entry is not valid, ignore it. */ if (!(m->status & MCI_STATUS_VAL)) return false; diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b0e00ec5cc8c..a31cf984619c 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -67,6 +67,7 @@ void mce_track_storm(struct mce *mce); void mce_inherit_storm(unsigned int bank); bool mce_get_storm_mode(void); void mce_set_storm_mode(bool storm); +u32 mce_get_apei_thr_limit(void); #else static inline void cmci_storm_begin(unsigned int bank) {} static inline void cmci_storm_end(unsigned int bank) {} @@ -74,6 +75,7 @@ static inline void mce_track_storm(struct mce *mce) {} static inline void mce_inherit_storm(unsigned int bank) {} static inline bool mce_get_storm_mode(void) { return false; } static inline void mce_set_storm_mode(bool storm) {} +static inline u32 mce_get_apei_thr_limit(void) { return 0; } #endif /* @@ -267,6 +269,7 @@ void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD void mce_threshold_create_device(unsigned int cpu); void mce_threshold_remove_device(unsigned int cpu); +void mce_amd_handle_storm(unsigned int bank, bool on); extern bool amd_filter_mce(struct mce *m); bool amd_mce_usable_address(struct mce *m); void amd_clear_bank(struct mce *m); @@ -299,6 +302,7 @@ void smca_bsp_init(void); #else static inline void mce_threshold_create_device(unsigned int cpu) { } static inline void mce_threshold_remove_device(unsigned int cpu) { } +static inline void mce_amd_handle_storm(unsigned int bank, bool on) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline bool amd_mce_usable_address(struct mce *m) { return false; } static inline void amd_clear_bank(struct mce *m) { } diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index f4a007616468..0d13c9ffcba0 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -13,6 +13,19 @@ #include "internal.h" +static u32 mce_apei_thr_limit; + +void mce_save_apei_thr_limit(u32 thr_limit) +{ + mce_apei_thr_limit = thr_limit; + pr_info("HEST corrected error threshold limit: %u\n", thr_limit); +} + +u32 mce_get_apei_thr_limit(void) +{ + return mce_apei_thr_limit; +} + static void default_threshold_interrupt(void) { pr_err("Unexpected threshold interrupt at vector %x\n", @@ -63,6 +76,9 @@ static void mce_handle_storm(unsigned int bank, bool on) case X86_VENDOR_INTEL: mce_intel_handle_storm(bank, on); break; + case X86_VENDOR_AMD: + mce_amd_handle_storm(bank, on); + break; } } @@ -85,7 +101,8 @@ void cmci_storm_end(unsigned int bank) { struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); - __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + if (!mce_flags.amd_threshold) + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); storm->banks[bank].history = 0; storm->banks[bank].in_storm_mode = false; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index b7c797dc94f4..3821a985f4ff 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -186,47 +186,61 @@ static u32 cpuid_to_ucode_rev(unsigned int val) return p.ucode_rev; } +static u32 get_cutoff_revision(u32 rev) +{ + switch (rev >> 8) { + case 0x80012: return 0x8001277; break; + case 0x80082: return 0x800820f; break; + case 0x83010: return 0x830107c; break; + case 0x86001: return 0x860010e; break; + case 0x86081: return 0x8608108; break; + case 0x87010: return 0x8701034; break; + case 0x8a000: return 0x8a0000a; break; + case 0xa0010: return 0xa00107a; break; + case 0xa0011: return 0xa0011da; break; + case 0xa0012: return 0xa001243; break; + case 0xa0082: return 0xa00820e; break; + case 0xa1011: return 0xa101153; break; + case 0xa1012: return 0xa10124e; break; + case 0xa1081: return 0xa108109; break; + case 0xa2010: return 0xa20102f; break; + case 0xa2012: return 0xa201212; break; + case 0xa4041: return 0xa404109; break; + case 0xa5000: return 0xa500013; break; + case 0xa6012: return 0xa60120a; break; + case 0xa7041: return 0xa704109; break; + case 0xa7052: return 0xa705208; break; + case 0xa7080: return 0xa708009; break; + case 0xa70c0: return 0xa70C009; break; + case 0xaa001: return 0xaa00116; break; + case 0xaa002: return 0xaa00218; break; + case 0xb0021: return 0xb002146; break; + case 0xb0081: return 0xb008111; break; + case 0xb1010: return 0xb101046; break; + case 0xb2040: return 0xb204031; break; + case 0xb4040: return 0xb404031; break; + case 0xb4041: return 0xb404101; break; + case 0xb6000: return 0xb600031; break; + case 0xb6080: return 0xb608031; break; + case 0xb7000: return 0xb700031; break; + default: break; + + } + return 0; +} + static bool need_sha_check(u32 cur_rev) { + u32 cutoff; + if (!cur_rev) { cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev); } - switch (cur_rev >> 8) { - case 0x80012: return cur_rev <= 0x8001277; break; - case 0x80082: return cur_rev <= 0x800820f; break; - case 0x83010: return cur_rev <= 0x830107c; break; - case 0x86001: return cur_rev <= 0x860010e; break; - case 0x86081: return cur_rev <= 0x8608108; break; - case 0x87010: return cur_rev <= 0x8701034; break; - case 0x8a000: return cur_rev <= 0x8a0000a; break; - case 0xa0010: return cur_rev <= 0xa00107a; break; - case 0xa0011: return cur_rev <= 0xa0011da; break; - case 0xa0012: return cur_rev <= 0xa001243; break; - case 0xa0082: return cur_rev <= 0xa00820e; break; - case 0xa1011: return cur_rev <= 0xa101153; break; - case 0xa1012: return cur_rev <= 0xa10124e; break; - case 0xa1081: return cur_rev <= 0xa108109; break; - case 0xa2010: return cur_rev <= 0xa20102f; break; - case 0xa2012: return cur_rev <= 0xa201212; break; - case 0xa4041: return cur_rev <= 0xa404109; break; - case 0xa5000: return cur_rev <= 0xa500013; break; - case 0xa6012: return cur_rev <= 0xa60120a; break; - case 0xa7041: return cur_rev <= 0xa704109; break; - case 0xa7052: return cur_rev <= 0xa705208; break; - case 0xa7080: return cur_rev <= 0xa708009; break; - case 0xa70c0: return cur_rev <= 0xa70C009; break; - case 0xaa001: return cur_rev <= 0xaa00116; break; - case 0xaa002: return cur_rev <= 0xaa00218; break; - case 0xb0021: return cur_rev <= 0xb002146; break; - case 0xb1010: return cur_rev <= 0xb101046; break; - case 0xb2040: return cur_rev <= 0xb204031; break; - case 0xb4040: return cur_rev <= 0xb404031; break; - case 0xb6000: return cur_rev <= 0xb600031; break; - case 0xb7000: return cur_rev <= 0xb700031; break; - default: break; - } + cutoff = get_cutoff_revision(cur_rev); + if (cutoff) + return cur_rev <= cutoff; pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n"); pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev); @@ -491,6 +505,7 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) { u8 family = x86_family(bsp_cpuid_1_eax); struct microcode_header_amd *mc_hdr; + u32 cur_rev, cutoff, patch_rev; u32 sh_psize; u16 proc_id; u8 patch_fam; @@ -530,11 +545,32 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) proc_id = mc_hdr->processor_rev_id; patch_fam = 0xf + (proc_id >> 12); - ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); - if (patch_fam != family) return 1; + cur_rev = get_patch_level(); + + /* No cutoff revision means old/unaffected by signing algorithm weakness => matches */ + cutoff = get_cutoff_revision(cur_rev); + if (!cutoff) + goto ok; + + patch_rev = mc_hdr->patch_id; + + ucode_dbg("cur_rev: 0x%x, cutoff: 0x%x, patch_rev: 0x%x\n", + cur_rev, cutoff, patch_rev); + + if (cur_rev <= cutoff && patch_rev <= cutoff) + goto ok; + + if (cur_rev > cutoff && patch_rev > cutoff) + goto ok; + + return 1; + +ok: + ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); + return 0; } @@ -603,8 +639,6 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); - ucode_dbg("patch_id: 0x%x\n", mc->hdr.patch_id); - if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index f75c140906d0..ccc83b0bf63c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -136,7 +136,7 @@ bool __init microcode_loader_disabled(void) return dis_ucode_ldr; } -static void early_parse_cmdline(void) +static void __init early_parse_cmdline(void) { char cmd_buf[64] = {}; char *s, *p = cmd_buf; @@ -589,6 +589,17 @@ static int load_late_stop_cpus(bool is_safe) pr_err("You should switch to early loading, if possible.\n"); } + /* + * Pre-load the microcode image into a staging device. This + * process is preemptible and does not require stopping CPUs. + * Successful staging simplifies the subsequent late-loading + * process, reducing rendezvous time. + * + * Even if the transfer fails, the update will proceed as usual. + */ + if (microcode_ops->use_staging) + microcode_ops->stage_microcode(); + atomic_set(&late_cpus_in, num_online_cpus()); atomic_set(&offline_in_nmi, 0); loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 371ca6eac00e..8744f3adc2a0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -13,12 +13,15 @@ #define pr_fmt(fmt) "microcode: " fmt #include <linux/earlycpio.h> #include <linux/firmware.h> +#include <linux/pci_ids.h> #include <linux/uaccess.h> #include <linux/initrd.h> #include <linux/kernel.h> +#include <linux/delay.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/uio.h> +#include <linux/io.h> #include <linux/mm.h> #include <asm/cpu_device_id.h> @@ -33,6 +36,38 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; #define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL) +/* Defines for the microcode staging mailbox interface */ +#define MBOX_REG_NUM 4 +#define MBOX_REG_SIZE sizeof(u32) + +#define MBOX_CONTROL_OFFSET 0x0 +#define MBOX_STATUS_OFFSET 0x4 +#define MBOX_WRDATA_OFFSET 0x8 +#define MBOX_RDDATA_OFFSET 0xc + +#define MASK_MBOX_CTRL_ABORT BIT(0) +#define MASK_MBOX_CTRL_GO BIT(31) + +#define MASK_MBOX_STATUS_ERROR BIT(2) +#define MASK_MBOX_STATUS_READY BIT(31) + +#define MASK_MBOX_RESP_SUCCESS BIT(0) +#define MASK_MBOX_RESP_PROGRESS BIT(1) +#define MASK_MBOX_RESP_ERROR BIT(2) + +#define MBOX_CMD_LOAD 0x3 +#define MBOX_OBJ_STAGING 0xb +#define MBOX_HEADER(size) ((PCI_VENDOR_ID_INTEL) | \ + (MBOX_OBJ_STAGING << 16) | \ + ((u64)((size) / sizeof(u32)) << 32)) + +/* The size of each mailbox header */ +#define MBOX_HEADER_SIZE sizeof(u64) +/* The size of staging hardware response */ +#define MBOX_RESPONSE_SIZE sizeof(u64) + +#define MBOX_XACTION_TIMEOUT_MS (10 * MSEC_PER_SEC) + /* Current microcode patch used in early patching on the APs. */ static struct microcode_intel *ucode_patch_va __read_mostly; static struct microcode_intel *ucode_patch_late __read_mostly; @@ -54,6 +89,23 @@ struct extended_sigtable { struct extended_signature sigs[]; }; +/** + * struct staging_state - Track the current staging process state + * + * @mmio_base: MMIO base address for staging + * @ucode_len: Total size of the microcode image + * @chunk_size: Size of each data piece + * @bytes_sent: Total bytes transmitted so far + * @offset: Current offset in the microcode image + */ +struct staging_state { + void __iomem *mmio_base; + unsigned int ucode_len; + unsigned int chunk_size; + unsigned int bytes_sent; + unsigned int offset; +}; + #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) @@ -299,6 +351,298 @@ static __init struct microcode_intel *scan_microcode(void *data, size_t size, return size ? NULL : patch; } +static inline u32 read_mbox_dword(void __iomem *mmio_base) +{ + u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET); + + /* Acknowledge read completion to the staging hardware */ + writel(0, mmio_base + MBOX_RDDATA_OFFSET); + return dword; +} + +static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword) +{ + writel(dword, mmio_base + MBOX_WRDATA_OFFSET); +} + +static inline u64 read_mbox_header(void __iomem *mmio_base) +{ + u32 high, low; + + low = read_mbox_dword(mmio_base); + high = read_mbox_dword(mmio_base); + + return ((u64)high << 32) | low; +} + +static inline void write_mbox_header(void __iomem *mmio_base, u64 value) +{ + write_mbox_dword(mmio_base, value); + write_mbox_dword(mmio_base, value >> 32); +} + +static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_bytes) +{ + int i; + + /* + * The MMIO space is mapped as Uncached (UC). Each write arrives + * at the device as an individual transaction in program order. + * The device can then reassemble the sequence accordingly. + */ + for (i = 0; i < chunk_bytes / sizeof(u32); i++) + write_mbox_dword(mmio_base, chunk[i]); +} + +/* + * Prepare for a new microcode transfer: reset hardware and record the + * image size. + */ +static void init_stage(struct staging_state *ss) +{ + ss->ucode_len = get_totalsize(&ucode_patch_late->hdr); + + /* + * Abort any ongoing process, effectively resetting the device. + * Unlike regular mailbox data processing requests, this + * operation does not require a status check. + */ + writel(MASK_MBOX_CTRL_ABORT, ss->mmio_base + MBOX_CONTROL_OFFSET); +} + +/* + * Update the chunk size and decide whether another chunk can be sent. + * This accounts for remaining data and retry limits. + */ +static bool can_send_next_chunk(struct staging_state *ss, int *err) +{ + /* A page size or remaining bytes if this is the final chunk */ + ss->chunk_size = min(PAGE_SIZE, ss->ucode_len - ss->offset); + + /* + * Each microcode image is divided into chunks, each at most + * one page size. A 10-chunk image would typically require 10 + * transactions. + * + * However, the hardware managing the mailbox has limited + * resources and may not cache the entire image, potentially + * requesting the same chunk multiple times. + * + * To tolerate this behavior, allow up to twice the expected + * number of transactions (i.e., a 10-chunk image can take up to + * 20 attempts). + * + * If the number of attempts exceeds this limit, treat it as + * exceeding the maximum allowed transfer size. + */ + if (ss->bytes_sent + ss->chunk_size > ss->ucode_len * 2) { + *err = -EMSGSIZE; + return false; + } + + *err = 0; + return true; +} + +/* + * The hardware indicates completion by returning a sentinel end offset. + */ +static inline bool is_end_offset(u32 offset) +{ + return offset == UINT_MAX; +} + +/* + * Determine whether staging is complete: either the hardware signaled + * the end offset, or no more transactions are permitted (retry limit + * reached). + */ +static inline bool staging_is_complete(struct staging_state *ss, int *err) +{ + return is_end_offset(ss->offset) || !can_send_next_chunk(ss, err); +} + +/* + * Wait for the hardware to complete a transaction. + * Return 0 on success, or an error code on failure. + */ +static int wait_for_transaction(struct staging_state *ss) +{ + u32 timeout, status; + + /* Allow time for hardware to complete the operation: */ + for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) { + msleep(1); + + status = readl(ss->mmio_base + MBOX_STATUS_OFFSET); + /* Break out early if the hardware is ready: */ + if (status & MASK_MBOX_STATUS_READY) + break; + } + + /* Check for explicit error response */ + if (status & MASK_MBOX_STATUS_ERROR) + return -EIO; + + /* + * Hardware has neither responded to the action nor signaled any + * error. Treat this as a timeout. + */ + if (!(status & MASK_MBOX_STATUS_READY)) + return -ETIMEDOUT; + + return 0; +} + +/* + * Transmit a chunk of the microcode image to the hardware. + * Return 0 on success, or an error code on failure. + */ +static int send_data_chunk(struct staging_state *ss, void *ucode_ptr) +{ + u32 *src_chunk = ucode_ptr + ss->offset; + u16 mbox_size; + + /* + * Write a 'request' mailbox object in this order: + * 1. Mailbox header includes total size + * 2. Command header specifies the load operation + * 3. Data section contains a microcode chunk + * + * Thus, the mailbox size is two headers plus the chunk size. + */ + mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size; + write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size)); + write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD); + write_mbox_data(ss->mmio_base, src_chunk, ss->chunk_size); + ss->bytes_sent += ss->chunk_size; + + /* Notify the hardware that the mailbox is ready for processing. */ + writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET); + + return wait_for_transaction(ss); +} + +/* + * Retrieve the next offset from the hardware response. + * Return 0 on success, or an error code on failure. + */ +static int fetch_next_offset(struct staging_state *ss) +{ + const u64 expected_header = MBOX_HEADER(MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE); + u32 offset, status; + u64 header; + + /* + * The 'response' mailbox returns three fields, in order: + * 1. Header + * 2. Next offset in the microcode image + * 3. Status flags + */ + header = read_mbox_header(ss->mmio_base); + offset = read_mbox_dword(ss->mmio_base); + status = read_mbox_dword(ss->mmio_base); + + /* All valid responses must start with the expected header. */ + if (header != expected_header) { + pr_err_once("staging: invalid response header (0x%llx)\n", header); + return -EBADR; + } + + /* + * Verify the offset: If not at the end marker, it must not + * exceed the microcode image length. + */ + if (!is_end_offset(offset) && offset > ss->ucode_len) { + pr_err_once("staging: invalid offset (%u) past the image end (%u)\n", + offset, ss->ucode_len); + return -EINVAL; + } + + /* Hardware may report errors explicitly in the status field */ + if (status & MASK_MBOX_RESP_ERROR) + return -EPROTO; + + ss->offset = offset; + return 0; +} + +/* + * Handle the staging process using the mailbox MMIO interface. The + * microcode image is transferred in chunks until completion. + * Return 0 on success or an error code on failure. + */ +static int do_stage(u64 mmio_pa) +{ + struct staging_state ss = {}; + int err; + + ss.mmio_base = ioremap(mmio_pa, MBOX_REG_NUM * MBOX_REG_SIZE); + if (WARN_ON_ONCE(!ss.mmio_base)) + return -EADDRNOTAVAIL; + + init_stage(&ss); + + /* Perform the staging process while within the retry limit */ + while (!staging_is_complete(&ss, &err)) { + /* Send a chunk of microcode each time: */ + err = send_data_chunk(&ss, ucode_patch_late); + if (err) + break; + /* + * Then, ask the hardware which piece of the image it + * needs next. The same piece may be sent more than once. + */ + err = fetch_next_offset(&ss); + if (err) + break; + } + + iounmap(ss.mmio_base); + + return err; +} + +static void stage_microcode(void) +{ + unsigned int pkg_id = UINT_MAX; + int cpu, err; + u64 mmio_pa; + + if (!IS_ALIGNED(get_totalsize(&ucode_patch_late->hdr), sizeof(u32))) { + pr_err("Microcode image 32-bit misaligned (0x%x), staging failed.\n", + get_totalsize(&ucode_patch_late->hdr)); + return; + } + + lockdep_assert_cpus_held(); + + /* + * The MMIO address is unique per package, and all the SMT + * primary threads are online here. Find each MMIO space by + * their package IDs to avoid duplicate staging. + */ + for_each_cpu(cpu, cpu_primary_thread_mask) { + if (topology_logical_package_id(cpu) == pkg_id) + continue; + + pkg_id = topology_logical_package_id(cpu); + + err = rdmsrq_on_cpu(cpu, MSR_IA32_MCU_STAGING_MBOX_ADDR, &mmio_pa); + if (WARN_ON_ONCE(err)) + return; + + err = do_stage(mmio_pa); + if (err) { + pr_err("Error: staging failed (%d) for CPU%d at package %u.\n", + err, cpu, pkg_id); + return; + } + } + + pr_info("Staging of patch revision 0x%x succeeded.\n", ucode_patch_late->hdr.rev); +} + static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, struct microcode_intel *mc, u32 *cur_rev) @@ -627,6 +971,7 @@ static struct microcode_ops microcode_intel_ops = { .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_late, .finalize_late_load = finalize_late_load, + .stage_microcode = stage_microcode, .use_nmi = IS_ENABLED(CONFIG_X86_64), }; @@ -638,6 +983,18 @@ static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c) llc_size_per_core = (unsigned int)llc_size; } +static __init bool staging_available(void) +{ + u64 val; + + val = x86_read_arch_cap_msr(); + if (!(val & ARCH_CAP_MCU_ENUM)) + return false; + + rdmsrq(MSR_IA32_MCU_ENUMERATION, val); + return !!(val & MCU_STAGING); +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -648,6 +1005,11 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + if (staging_available()) { + microcode_intel_ops.use_staging = true; + pr_info("Enabled staging feature.\n"); + } + calc_llc_size_per_core(c); return µcode_intel_ops; diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index ae8dbc2b908d..a10b547eda1e 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -31,10 +31,12 @@ struct microcode_ops { * See also the "Synchronization" section in microcode_core.c. */ enum ucode_state (*apply_microcode)(int cpu); + void (*stage_microcode)(void); int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); void (*finalize_late_load)(int result); unsigned int nmi_safe : 1, - use_nmi : 1; + use_nmi : 1, + use_staging : 1; }; struct early_load_data { diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 5655f253d929..2de3bd2f95d1 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -46,10 +46,6 @@ struct set_mtrr_context { u32 ccr3; }; -void set_mtrr_done(struct set_mtrr_context *ctxt); -void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); -void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); - void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 06ca5a30140c..3792ab4819dc 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -274,6 +274,11 @@ static void rdt_get_cdp_config(int level) rdt_resources_all[level].r_resctrl.cdp_capable = true; } +static void rdt_set_io_alloc_capable(struct rdt_resource *r) +{ + r->cache.io_alloc_capable = true; +} + static void rdt_get_cdp_l3_config(void) { rdt_get_cdp_config(RDT_RESOURCE_L3); @@ -719,6 +724,7 @@ enum { RDT_FLAG_SMBA, RDT_FLAG_BMEC, RDT_FLAG_ABMC, + RDT_FLAG_SDCIAE, }; #define RDT_OPT(idx, n, f) \ @@ -745,6 +751,7 @@ static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC), + RDT_OPT(RDT_FLAG_SDCIAE, "sdciae", X86_FEATURE_SDCIAE), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -853,6 +860,8 @@ static __init bool get_rdt_alloc_resources(void) rdt_get_cache_alloc_cfg(1, r); if (rdt_cpu_has(X86_FEATURE_CDP_L3)) rdt_get_cdp_l3_config(); + if (rdt_cpu_has(X86_FEATURE_SDCIAE)) + rdt_set_io_alloc_capable(r); ret = true; } if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 1189c0df4ad7..b20e705606b8 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -91,3 +91,43 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, return hw_dom->ctrl_val[idx]; } + +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->sdciae_enabled; +} + +static void resctrl_sdciae_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT); +} + +static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_ctrl_domain *d; + + /* Walking r->ctrl_domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + /* Update MSR_IA32_L3_QOS_EXT_CFG MSR on all the CPUs in all domains */ + list_for_each_entry(d, &r->ctrl_domains, hdr.list) + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1); +} + +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (hw_res->r_resctrl.cache.io_alloc_capable && + hw_res->sdciae_enabled != enable) { + _resctrl_sdciae_enable(r, enable); + hw_res->sdciae_enabled = enable; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 9f4c2f0aaf5c..4a916c84a322 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -46,6 +46,9 @@ struct arch_mbm_state { #define ABMC_EXTENDED_EVT_ID BIT(31) #define ABMC_EVT_ID BIT(0) +/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */ +#define SDCIAE_ENABLE_BIT 1 + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -112,6 +115,7 @@ struct msr_param { * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource * @mbm_cntr_assign_enabled: ABMC feature is enabled + * @sdciae_enabled: SDCIAE feature (backing "io_alloc") is enabled. * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -126,6 +130,7 @@ struct rdt_hw_resource { unsigned int mbm_width; bool cdp_enabled; bool mbm_cntr_assign_enabled; + bool sdciae_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index fe1a2aa53c16..dffcc8307500 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -361,6 +361,7 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = { X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0), {} }; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index caa4dc885c21..0524ac0260fc 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -43,6 +43,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EUPDATESVN, CPUID_EAX, 10, 0x00000012, 0 }, { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, @@ -53,6 +54,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 }, + { X86_FEATURE_SDCIAE, CPUID_EBX, 6, 0x80000020, 0 }, { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 }, { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 }, { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 7f8d1e11dbee..79d6020dfe9c 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -14,7 +14,7 @@ u64 sgx_attributes_reserved_mask; u64 sgx_xfrm_reserved_mask = ~0x3; u32 sgx_misc_reserved_mask; -static int sgx_open(struct inode *inode, struct file *file) +static int __sgx_open(struct inode *inode, struct file *file) { struct sgx_encl *encl; int ret; @@ -41,6 +41,23 @@ static int sgx_open(struct inode *inode, struct file *file) return 0; } +static int sgx_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = sgx_inc_usage_count(); + if (ret) + return ret; + + ret = __sgx_open(inode, file); + if (ret) { + sgx_dec_usage_count(); + return ret; + } + + return 0; +} + static int sgx_release(struct inode *inode, struct file *file) { struct sgx_encl *encl = file->private_data; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 308dbbae6c6e..cf149b9f4916 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -765,6 +765,7 @@ void sgx_encl_release(struct kref *ref) WARN_ON_ONCE(encl->secs.epc_page); kfree(encl); + sgx_dec_usage_count(); } /* diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 42a088a337c5..74be751199a4 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -233,4 +233,9 @@ static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) return __encls_2(EAUG, pginfo, addr); } +/* Attempt to update CPUSVN at runtime. */ +static inline int __eupdatesvn(void) +{ + return __encls_ret_1(EUPDATESVN, ""); +} #endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index fc8fb64d62f4..dc73194416ac 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -17,6 +17,7 @@ #include <linux/vmalloc.h> #include <asm/msr.h> #include <asm/sgx.h> +#include <asm/archrandom.h> #include "driver.h" #include "encl.h" #include "encls.h" @@ -918,6 +919,106 @@ int sgx_set_attribute(unsigned long *allowed_attributes, } EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute); +/* Counter to count the active SGX users */ +static int sgx_usage_count; + +/** + * sgx_update_svn() - Attempt to call ENCLS[EUPDATESVN]. + * + * This instruction attempts to update CPUSVN to the + * currently loaded microcode update SVN and generate new + * cryptographic assets. + * + * Return: + * * %0: - Success or not supported + * * %-EAGAIN: - Can be safely retried, failure is due to lack of + * * entropy in RNG + * * %-EIO: - Unexpected error, retries are not advisable + */ +static int sgx_update_svn(void) +{ + int ret; + + /* + * If EUPDATESVN is not available, it is ok to + * silently skip it to comply with legacy behavior. + */ + if (!cpu_feature_enabled(X86_FEATURE_SGX_EUPDATESVN)) + return 0; + + /* + * EPC is guaranteed to be empty when there are no users. + * Ensure we are on our first user before proceeding further. + */ + WARN(sgx_usage_count, "Elevated usage count when calling EUPDATESVN\n"); + + for (int i = 0; i < RDRAND_RETRY_LOOPS; i++) { + ret = __eupdatesvn(); + + /* Stop on success or unexpected errors: */ + if (ret != SGX_INSUFFICIENT_ENTROPY) + break; + } + + switch (ret) { + case 0: + /* + * SVN successfully updated. + * Let users know when the update was successful. + */ + pr_info("SVN updated successfully\n"); + return 0; + case SGX_NO_UPDATE: + /* + * SVN update failed since the current SVN is + * not newer than CPUSVN. This is the most + * common case and indicates no harm. + */ + return 0; + case SGX_INSUFFICIENT_ENTROPY: + /* + * SVN update failed due to lack of entropy in DRNG. + * Indicate to userspace that it should retry. + */ + return -EAGAIN; + default: + break; + } + + /* + * EUPDATESVN was called when EPC is empty, all other error + * codes are unexpected. + */ + ENCLS_WARN(ret, "EUPDATESVN"); + return -EIO; +} + +/* Mutex to ensure no concurrent EPC accesses during EUPDATESVN */ +static DEFINE_MUTEX(sgx_svn_lock); + +int sgx_inc_usage_count(void) +{ + int ret; + + guard(mutex)(&sgx_svn_lock); + + if (!sgx_usage_count) { + ret = sgx_update_svn(); + if (ret) + return ret; + } + + sgx_usage_count++; + + return 0; +} + +void sgx_dec_usage_count(void) +{ + guard(mutex)(&sgx_svn_lock); + sgx_usage_count--; +} + static int __init sgx_init(void) { int ret; diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index d2dad21259a8..f5940393d9bd 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -102,6 +102,9 @@ static inline int __init sgx_vepc_init(void) } #endif +int sgx_inc_usage_count(void); +void sgx_dec_usage_count(void); + void sgx_update_lepubkeyhash(u64 *lepubkeyhash); #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 727f2570c8b9..8de1f1a755f2 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -256,10 +256,11 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) xa_destroy(&vepc->page_array); kfree(vepc); + sgx_dec_usage_count(); return 0; } -static int sgx_vepc_open(struct inode *inode, struct file *file) +static int __sgx_vepc_open(struct inode *inode, struct file *file) { struct sgx_vepc *vepc; @@ -274,6 +275,23 @@ static int sgx_vepc_open(struct inode *inode, struct file *file) return 0; } +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = sgx_inc_usage_count(); + if (ret) + return ret; + + ret = __sgx_vepc_open(inode, file); + if (ret) { + sgx_dec_usage_count(); + return ret; + } + + return 0; +} + static long sgx_vepc_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 6073a16628f9..f55ea3cdbf88 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -75,15 +75,11 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id) return phys_id == (u64)cpuid_to_apicid[cpu]; } -#ifdef CONFIG_SMP static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { if (!(apicid & (__max_threads_per_core - 1))) cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); } -#else -static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } -#endif /* * Convert the APIC ID to a domain level ID by masking out the low bits diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c index b5a5e1411469..71625795d711 100644 --- a/arch/x86/kernel/cpu/topology_common.c +++ b/arch/x86/kernel/cpu/topology_common.c @@ -16,6 +16,9 @@ EXPORT_SYMBOL_GPL(x86_topo_system); unsigned int __amd_nodes_per_pkg __ro_after_init; EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg); +/* CPUs which are the primary SMT threads */ +struct cpumask __cpu_primary_thread_mask __read_mostly; + void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, unsigned int shift, unsigned int ncpus) { diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index 49782724a943..209b5a22d880 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -19,7 +19,17 @@ #undef pr_fmt #define pr_fmt(fmt) "tsx: " fmt -enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; +enum tsx_ctrl_states { + TSX_CTRL_AUTO, + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_RTM_ALWAYS_ABORT, + TSX_CTRL_NOT_SUPPORTED, +}; + +static enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = + IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO) ? TSX_CTRL_AUTO : + IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF) ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE; static void tsx_disable(void) { @@ -156,11 +166,28 @@ static void tsx_dev_mode_disable(void) } } -void __init tsx_init(void) +static int __init tsx_parse_cmdline(char *str) { - char arg[5] = {}; - int ret; + if (!str) + return -EINVAL; + + if (!strcmp(str, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(str, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(str, "auto")) { + tsx_ctrl_state = TSX_CTRL_AUTO; + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("invalid option, defaulting to off\n"); + } + + return 0; +} +early_param("tsx", tsx_parse_cmdline); +void __init tsx_init(void) +{ tsx_dev_mode_disable(); /* @@ -194,27 +221,8 @@ void __init tsx_init(void) return; } - ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); - if (ret >= 0) { - if (!strcmp(arg, "on")) { - tsx_ctrl_state = TSX_CTRL_ENABLE; - } else if (!strcmp(arg, "off")) { - tsx_ctrl_state = TSX_CTRL_DISABLE; - } else if (!strcmp(arg, "auto")) { - tsx_ctrl_state = x86_get_tsx_auto_mode(); - } else { - tsx_ctrl_state = TSX_CTRL_DISABLE; - pr_err("invalid option, defaulting to off\n"); - } - } else { - /* tsx= not provided */ - if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) - tsx_ctrl_state = x86_get_tsx_auto_mode(); - else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) - tsx_ctrl_state = TSX_CTRL_DISABLE; - else - tsx_ctrl_state = TSX_CTRL_ENABLE; - } + if (tsx_ctrl_state == TSX_CTRL_AUTO) + tsx_ctrl_state = x86_get_tsx_auto_mode(); if (tsx_ctrl_state == TSX_CTRL_DISABLE) { tsx_disable(); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 71ee20102a8a..b10684dedc58 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -181,8 +181,8 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * in false positive reports. Disable instrumentation to avoid those. */ __no_kmsan_checks -static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, const char *log_lvl) +static void __show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, const char *log_lvl) { struct unwind_state state; struct stack_info stack_info = {0}; @@ -303,6 +303,25 @@ next: } } +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, const char *log_lvl) +{ + /* + * Disable KASAN to avoid false positives during walking another + * task's stacks, as values on these stacks may change concurrently + * with task execution. + */ + bool disable_kasan = task && task != current; + + if (disable_kasan) + kasan_disable_current(); + + __show_trace_log_lvl(task, regs, stack, log_lvl); + + if (disable_kasan) + kasan_enable_current(); +} + void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 367da3638167..823dbdd0eb41 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -354,12 +354,17 @@ SYM_CODE_START(return_to_handler) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR + /* Restore return_to_handler value that got eaten by previous ret instruction. */ + subq $8, %rsp + UNWIND_HINT_FUNC + /* Save ftrace_regs for function exit context */ subq $(FRAME_SIZE), %rsp movq %rax, RAX(%rsp) movq %rdx, RDX(%rsp) movq %rbp, RBP(%rsp) + movq %rsp, RSP(%rsp) movq %rsp, %rdi call ftrace_return_to_handler @@ -368,7 +373,8 @@ SYM_CODE_START(return_to_handler) movq RDX(%rsp), %rdx movq RAX(%rsp), %rax - addq $(FRAME_SIZE), %rsp + addq $(FRAME_SIZE) + 8, %rsp + /* * Jump back to the old return address. This cannot be JMP_NOSPEC rdi * since IBT would demand that contain ENDBR, which simply isn't so for diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 3863d7709386..c1fac3a9fecc 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -141,7 +141,6 @@ bool can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; insn_byte_t prefix; - int i; if (search_exception_tables((unsigned long)addr)) return false; /* Page fault may occur on this address. */ @@ -154,7 +153,7 @@ bool can_boost(struct insn *insn, void *addr) if (insn->opcode.nbytes != 1) return false; - for_each_insn_prefix(insn, i, prefix) { + for_each_insn_prefix(insn, prefix) { insn_attr_t attr; attr = inat_get_opcode_attribute(prefix); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 0aabd4c4e2c4..6f826a00eca2 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -103,7 +103,6 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) asm ( ".pushsection .rodata\n" - "optprobe_template_func:\n" ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 @@ -160,9 +159,6 @@ asm ( "optprobe_template_end:\n" ".popsection\n"); -void optprobe_template_func(void); -STACK_FRAME_NON_STANDARD(optprobe_template_func); - #define TMPL_CLAC_IDX \ ((long)optprobe_template_clac - (long)optprobe_template_entry) #define TMPL_MOVE_IDX \ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 0ffbae902e2f..11c45ce42694 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -97,6 +97,7 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, DEBUGP("%s relocate section %u to %u\n", apply ? "Applying" : "Clearing", relsec, sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { size_t size; @@ -162,15 +163,17 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, if (apply) { if (memcmp(loc, &zero, size)) { - pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), loc, val); + pr_err("x86/modules: Invalid relocation target, existing value is nonzero for sec %u, idx %u, type %d, loc %lx, val %llx\n", + relsec, i, (int)ELF64_R_TYPE(rel[i].r_info), + (unsigned long)loc, val); return -ENOEXEC; } write(loc, &val, size); } else { if (memcmp(loc, &val, size)) { - pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), loc, val); + pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for sec %u, idx %u, type %d, loc %lx, val %llx\n", + relsec, i, (int)ELF64_R_TYPE(rel[i].r_info), + (unsigned long)loc, val); return -ENOEXEC; } write(loc, &zero, size); @@ -179,8 +182,8 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, return 0; overflow: - pr_err("overflow in relocation type %d val %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), val); + pr_err("overflow in relocation type %d val %llx sec %u idx %d\n", + (int)ELF64_R_TYPE(rel[i].r_info), val, relsec, i); pr_err("`%s' likely not compiled with -mcmodel=kernel\n", me->name); return -ENOEXEC; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index eb289abece23..5cd6950ab672 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -103,9 +103,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); EXPORT_PER_CPU_SYMBOL(cpu_die_map); -/* CPUs which are the primary SMT threads */ -struct cpumask __cpu_primary_thread_mask __read_mostly; - /* Representing CPUs for which sibling maps can be computed */ static cpumask_var_t cpu_sibling_setup_mask; @@ -515,6 +512,76 @@ static void __init build_sched_topology(void) set_sched_topology(topology); } +#ifdef CONFIG_NUMA +static int sched_avg_remote_distance; +static int avg_remote_numa_distance(void) +{ + int i, j; + int distance, nr_remote, total_distance; + + if (sched_avg_remote_distance > 0) + return sched_avg_remote_distance; + + nr_remote = 0; + total_distance = 0; + for_each_node_state(i, N_CPU) { + for_each_node_state(j, N_CPU) { + distance = node_distance(i, j); + + if (distance >= REMOTE_DISTANCE) { + nr_remote++; + total_distance += distance; + } + } + } + if (nr_remote) + sched_avg_remote_distance = total_distance / nr_remote; + else + sched_avg_remote_distance = REMOTE_DISTANCE; + + return sched_avg_remote_distance; +} + +int arch_sched_node_distance(int from, int to) +{ + int d = node_distance(from, to); + + switch (boot_cpu_data.x86_vfm) { + case INTEL_GRANITERAPIDS_X: + case INTEL_ATOM_DARKMONT_X: + + if (!x86_has_numa_in_package || topology_max_packages() == 1 || + d < REMOTE_DISTANCE) + return d; + + /* + * With SNC enabled, there could be too many levels of remote + * NUMA node distances, creating NUMA domain levels + * including local nodes and partial remote nodes. + * + * Trim finer distance tuning for NUMA nodes in remote package + * for the purpose of building sched domains. Group NUMA nodes + * in the remote package in the same sched group. + * Simplify NUMA domains and avoid extra NUMA levels including + * different remote NUMA nodes and local nodes. + * + * GNR and CWF don't expect systems with more than 2 packages + * and more than 2 hops between packages. Single average remote + * distance won't be appropriate if there are more than 2 + * packages as average distance to different remote packages + * could be different. + */ + WARN_ONCE(topology_max_packages() > 2, + "sched: Expect only up to 2 packages for GNR or CWF, " + "but saw %d packages when building sched domains.", + topology_max_packages()); + + d = avg_remote_numa_distance(); + } + return d; +} +#endif /* CONFIG_NUMA */ + void set_cpu_sibling_map(int cpu) { bool has_smt = __max_threads_per_core > 1; @@ -1328,11 +1395,7 @@ void __noreturn hlt_play_dead(void) native_halt(); } -/* - * native_play_dead() is essentially a __noreturn function, but it can't - * be marked as such as the compiler may complain about it. - */ -void native_play_dead(void) +void __noreturn native_play_dead(void) { if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) __update_spec_ctrl(0); @@ -1351,7 +1414,7 @@ int native_cpu_disable(void) return -ENOSYS; } -void native_play_dead(void) +void __noreturn native_play_dead(void) { BUG(); } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 378c388d1b31..2892cdb14563 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -26,6 +26,11 @@ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 }; static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; +/* + * ud1 (%edx),%rdi -- see __WARN_trap() / decode_bug() + */ +static const u8 warninsn[] = { 0x67, 0x48, 0x0f, 0xb9, 0x3a }; + static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */ { u8 ret = 0; @@ -69,7 +74,10 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, emulate = code; code = &xor5rax; } - + if (func == &__WARN_trap) { + emulate = code; + code = &warninsn; + } break; case NOP: @@ -128,7 +136,8 @@ static void __static_call_validate(u8 *insn, bool tail, bool tramp) } else { if (opcode == CALL_INSN_OPCODE || !memcmp(insn, x86_nops[5], 5) || - !memcmp(insn, xor5rax, 5)) + !memcmp(insn, xor5rax, 5) || + !memcmp(insn, warninsn, 5)) return; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6b22611e69cc..cb324cc1fd99 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -31,6 +31,7 @@ #include <linux/kexec.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/static_call.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bug.h> @@ -102,25 +103,37 @@ __always_inline int is_valid_bugaddr(unsigned long addr) * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax * static_call: 0f b9 cc ud1 %esp,%ecx + * __WARN_trap: 67 48 0f b9 3a ud1 (%edx),%reg * - * Notably UBSAN uses EAX, static_call uses ECX. + * Notable, since __WARN_trap can use all registers, the distinction between + * UD1 users is through R/M. */ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) { unsigned long start = addr; + u8 v, reg, rm, rex = 0; + int type = BUG_UD1; bool lock = false; - u8 v; if (addr < TASK_SIZE_MAX) return BUG_NONE; - v = *(u8 *)(addr++); - if (v == INSN_ASOP) + for (;;) { v = *(u8 *)(addr++); + if (v == INSN_ASOP) + continue; - if (v == INSN_LOCK) { - lock = true; - v = *(u8 *)(addr++); + if (v == INSN_LOCK) { + lock = true; + continue; + } + + if ((v & 0xf0) == 0x40) { + rex = v; + continue; + } + + break; } switch (v) { @@ -156,18 +169,33 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4) addr++; /* SIB */ + reg = X86_MODRM_REG(v) + 8*!!X86_REX_R(rex); + rm = X86_MODRM_RM(v) + 8*!!X86_REX_B(rex); + /* Decode immediate, if present */ switch (X86_MODRM_MOD(v)) { case 0: if (X86_MODRM_RM(v) == 5) - addr += 4; /* RIP + disp32 */ + addr += 4; /* RIP + disp32 */ + + if (rm == 0) /* (%eax) */ + type = BUG_UD1_UBSAN; + + if (rm == 2) { /* (%edx) */ + *imm = reg; + type = BUG_UD1_WARN; + } break; case 1: *imm = *(s8 *)addr; addr += 1; + if (rm == 0) /* (%eax) */ + type = BUG_UD1_UBSAN; break; case 2: *imm = *(s32 *)addr; addr += 4; + if (rm == 0) /* (%eax) */ + type = BUG_UD1_UBSAN; break; case 3: break; @@ -176,12 +204,76 @@ __always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) /* record instruction length */ *len = addr - start; - if (X86_MODRM_REG(v) == 0) /* EAX */ - return BUG_UD1_UBSAN; + return type; +} - return BUG_UD1; +static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr) +{ + int offset = pt_regs_offset(regs, nr); + if (WARN_ON_ONCE(offset < -0)) + return 0; + return *((unsigned long *)((void *)regs + offset)); } +#ifdef HAVE_ARCH_BUG_FORMAT_ARGS +DEFINE_STATIC_CALL(WARN_trap, __WARN_trap); +EXPORT_STATIC_CALL_TRAMP(WARN_trap); + +/* + * Create a va_list from an exception context. + */ +void *__warn_args(struct arch_va_list *args, struct pt_regs *regs) +{ + /* + * Register save area; populate with function call argument registers + */ + args->regs[0] = regs->di; + args->regs[1] = regs->si; + args->regs[2] = regs->dx; + args->regs[3] = regs->cx; + args->regs[4] = regs->r8; + args->regs[5] = regs->r9; + + /* + * From the ABI document: + * + * @gp_offset - the element holds the offset in bytes from + * reg_save_area to the place where the next available general purpose + * argument register is saved. In case all argument registers have + * been exhausted, it is set to the value 48 (6*8). + * + * @fp_offset - the element holds the offset in bytes from + * reg_save_area to the place where the next available floating point + * argument is saved. In case all argument registers have been + * exhausted, it is set to the value 176 (6*8 + 8*16) + * + * @overflow_arg_area - this pointer is used to fetch arguments passed + * on the stack. It is initialized with the address of the first + * argument passed on the stack, if any, and then always updated to + * point to the start of the next argument on the stack. + * + * @reg_save_area - the element points to the start of the register + * save area. + * + * Notably the vararg starts with the second argument and there are no + * floating point arguments in the kernel. + */ + args->args.gp_offset = 1*8; + args->args.fp_offset = 6*8 + 8*16; + args->args.reg_save_area = &args->regs; + args->args.overflow_arg_area = (void *)regs->sp; + + /* + * If the exception came from __WARN_trap, there is a return + * address on the stack, skip that. This is why any __WARN_trap() + * caller must inhibit tail-call optimization. + */ + if ((void *)regs->ip == &__WARN_trap) + args->args.overflow_arg_area += 8; + + return &args->args; +} +#endif /* HAVE_ARCH_BUG_FORMAT */ static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, @@ -334,6 +426,11 @@ static noinstr bool handle_bug(struct pt_regs *regs) raw_local_irq_enable(); switch (ud_type) { + case BUG_UD1_WARN: + if (report_bug_entry((void *)pt_regs_val(regs, ud_imm), regs) == BUG_TRAP_TYPE_WARN) + handled = true; + break; + case BUG_UD2: if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { handled = true; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 845aeaf36b8d..7be8e361ca55 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -17,6 +17,7 @@ #include <linux/kdebug.h> #include <asm/processor.h> #include <asm/insn.h> +#include <asm/insn-eval.h> #include <asm/mmu_context.h> #include <asm/nops.h> @@ -258,9 +259,8 @@ static volatile u32 good_2byte_insns[256 / 32] = { static bool is_prefix_bad(struct insn *insn) { insn_byte_t p; - int i; - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { insn_attr_t attr; attr = inat_get_opcode_attribute(p); @@ -1158,35 +1158,12 @@ unlock: mmap_write_unlock(mm); } -static bool insn_is_nop(struct insn *insn) -{ - return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; -} - -static bool insn_is_nopl(struct insn *insn) -{ - if (insn->opcode.nbytes != 2) - return false; - - if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) - return false; - - if (!insn->modrm.nbytes) - return false; - - if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) - return false; - - /* 0f 1f /0 - NOPL */ - return true; -} - static bool can_optimize(struct insn *insn, unsigned long vaddr) { if (!insn->x86_64 || insn->length != 5) return false; - if (!insn_is_nop(insn) && !insn_is_nopl(insn)) + if (!insn_is_nop(insn)) return false; /* We can't do cross page atomic writes yet. */ @@ -1426,19 +1403,14 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn); insn_byte_t p; - int i; - /* x86_nops[insn->length]; same as jmp with .offs = 0 */ - if (insn->length <= ASM_NOP_MAX && - !memcmp(insn->kaddr, x86_nops[insn->length], insn->length)) + if (insn_is_nop(insn)) goto setup; switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ break; - case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ - goto setup; case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); @@ -1463,7 +1435,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. * No one uses these insns, reject any branch insns with such prefix. */ - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { if (p == 0x66) return -ENOTSUPP; } @@ -1819,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, else return regs->sp <= ret->stack; } + +/* + * Heuristic-based check if uprobe is installed at the function entry. + * + * Under assumption of user code being compiled with frame pointers, + * `push %rbp/%ebp` is a good indicator that we indeed are. + * + * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. + * If we get this wrong, captured stack trace might have one extra bogus + * entry, but the rest of stack trace will still be meaningful. + */ +bool is_uprobe_at_func_entry(struct pt_regs *regs) +{ + struct arch_uprobe *auprobe; + + if (!current->utask) + return false; + + auprobe = current->utask->auprobe; + if (!auprobe) + return false; + + /* push %rbp/%ebp */ + if (auprobe->insn[0] == 0x55) + return true; + + /* endbr64 (64-bit only) */ + if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) + return true; + + return false; +} diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index f286b5706d7c..fef00546c885 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -216,7 +216,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm) * This function is called from IOMMU driver to notify * SVM to schedule in a particular vCPU of a particular VM. */ -int avic_ga_log_notifier(u32 ga_tag) +static int avic_ga_log_notifier(u32 ga_tag) { unsigned long flags; struct kvm_svm *kvm_svm; @@ -788,7 +788,7 @@ int avic_init_vcpu(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); + raw_spin_lock_init(&svm->ir_list_lock); if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) return 0; @@ -816,9 +816,9 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd) if (!vcpu) return; - spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); + raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags); list_del(&irqfd->vcpu_list); - spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); + raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags); } int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, @@ -855,7 +855,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, * list of IRQs being posted to the vCPU, to ensure the IRTE * isn't programmed with stale pCPU/IsRunning information. */ - guard(spinlock_irqsave)(&svm->ir_list_lock); + guard(raw_spinlock_irqsave)(&svm->ir_list_lock); /* * Update the target pCPU for IOMMU doorbells if the vCPU is @@ -972,7 +972,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, * up-to-date entry information, or that this task will wait until * svm_ir_list_add() completes to set the new target pCPU. */ - spin_lock_irqsave(&svm->ir_list_lock, flags); + raw_spin_lock_irqsave(&svm->ir_list_lock, flags); entry = svm->avic_physical_id_entry; WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); @@ -997,7 +997,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action); - spin_unlock_irqrestore(&svm->ir_list_lock, flags); + raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1035,7 +1035,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) * or that this task will wait until svm_ir_list_add() completes to * mark the vCPU as not running. */ - spin_lock_irqsave(&svm->ir_list_lock, flags); + raw_spin_lock_irqsave(&svm->ir_list_lock, flags); avic_update_iommu_vcpu_affinity(vcpu, -1, action); @@ -1059,7 +1059,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) svm->avic_physical_id_entry = entry; - spin_unlock_irqrestore(&svm->ir_list_lock, flags); + raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags); } void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1243,3 +1243,9 @@ bool __init avic_hardware_setup(void) return true; } + +void avic_hardware_unsetup(void) +{ + if (avic) + amd_iommu_register_ga_log_notifier(NULL); +} diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a6443feab252..da6e80b3ac35 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -677,11 +677,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 */ svm_copy_lbrs(vmcb02, vmcb12); vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; - svm_update_lbrv(&svm->vcpu); - - } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + } else { svm_copy_lbrs(vmcb02, vmcb01); } + svm_update_lbrv(&svm->vcpu); } static inline bool is_evtinj_soft(u32 evtinj) @@ -833,11 +832,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, svm->soft_int_next_rip = vmcb12_rip; } - vmcb02->control.virt_ext = vmcb01->control.virt_ext & - LBR_CTL_ENABLE_MASK; - if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV)) - vmcb02->control.virt_ext |= - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); + /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */ if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; @@ -1189,13 +1184,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) svm_copy_lbrs(vmcb12, vmcb02); - svm_update_lbrv(vcpu); - } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + else svm_copy_lbrs(vmcb01, vmcb02); - svm_update_lbrv(vcpu); - } + + svm_update_lbrv(vcpu); if (vnmi) { if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 153c12dbf3eb..9d29b2e7e855 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -705,7 +705,11 @@ void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) { - bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); + struct vcpu_svm *svm = to_svm(vcpu); + bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); + + if (intercept == svm->lbr_msrs_intercepted) + return; svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept); svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept); @@ -714,6 +718,8 @@ static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) if (sev_es_guest(vcpu->kvm)) svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept); + + svm->lbr_msrs_intercepted = intercept; } void svm_vcpu_free_msrpm(void *msrpm) @@ -806,60 +812,43 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) vmcb_mark_dirty(to_vmcb, VMCB_LBR); } -void svm_enable_lbrv(struct kvm_vcpu *vcpu) +static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - svm_recalc_lbr_msr_intercepts(vcpu); - - /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ - if (is_guest_mode(vcpu)) - svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); + to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; } -static void svm_disable_lbrv(struct kvm_vcpu *vcpu) +void svm_enable_lbrv(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - - KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); - svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; + __svm_enable_lbrv(vcpu); svm_recalc_lbr_msr_intercepts(vcpu); - - /* - * Move the LBR msrs back to the vmcb01 to avoid copying them - * on nested guest entries. - */ - if (is_guest_mode(vcpu)) - svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); } -static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm) +static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) { - /* - * If LBR virtualization is disabled, the LBR MSRs are always kept in - * vmcb01. If LBR virtualization is enabled and L1 is running VMs of - * its own, the MSRs are moved between vmcb01 and vmcb02 as needed. - */ - return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb : - svm->vmcb01.ptr; + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; } void svm_update_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; - bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || + bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) || (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); - if (enable_lbrv == current_enable_lbrv) - return; + if (enable_lbrv && !current_enable_lbrv) + __svm_enable_lbrv(vcpu); + else if (!enable_lbrv && current_enable_lbrv) + __svm_disable_lbrv(vcpu); - if (enable_lbrv) - svm_enable_lbrv(vcpu); - else - svm_disable_lbrv(vcpu); + /* + * During nested transitions, it is possible that the current VMCB has + * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa). + * In this case, even though LBR_CTL does not need an update, intercepts + * do, so always recalculate the intercepts here. + */ + svm_recalc_lbr_msr_intercepts(vcpu); } void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -921,6 +910,8 @@ static void svm_hardware_unsetup(void) { int cpu; + avic_hardware_unsetup(); + sev_hardware_unsetup(); for_each_possible_cpu(cpu) @@ -1236,6 +1227,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) } svm->x2avic_msrs_intercepted = true; + svm->lbr_msrs_intercepted = true; svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); @@ -2722,19 +2714,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = svm->tsc_aux; break; case MSR_IA32_DEBUGCTLMSR: - msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; + msr_info->data = svm->vmcb->save.dbgctl; break; case MSR_IA32_LASTBRANCHFROMIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; + msr_info->data = svm->vmcb->save.br_from; break; case MSR_IA32_LASTBRANCHTOIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; + msr_info->data = svm->vmcb->save.br_to; break; case MSR_IA32_LASTINTFROMIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; + msr_info->data = svm->vmcb->save.last_excp_from; break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; + msr_info->data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; @@ -3002,7 +2994,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & DEBUGCTL_RESERVED_BITS) return 1; - svm_get_lbr_vmcb(svm)->save.dbgctl = data; + if (svm->vmcb->save.dbgctl == data) + break; + + svm->vmcb->save.dbgctl = data; + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); svm_update_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: @@ -5386,12 +5382,6 @@ static __init int svm_hardware_setup(void) svm_hv_hardware_setup(); - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - enable_apicv = avic_hardware_setup(); if (!enable_apicv) { enable_ipiv = false; @@ -5435,6 +5425,13 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } + return 0; err: diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e4b04f435b3d..dd78e6402345 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -329,13 +329,14 @@ struct vcpu_svm { * back into remapped mode). */ struct list_head ir_list; - spinlock_t ir_list_lock; + raw_spinlock_t ir_list_lock; struct vcpu_sev_es_state sev_es; bool guest_state_loaded; bool x2avic_msrs_intercepted; + bool lbr_msrs_intercepted; /* Guest GIF value, used when vGIF is not enabled */ bool guest_gif; @@ -805,7 +806,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; ) bool __init avic_hardware_setup(void); -int avic_ga_log_notifier(u32 ga_tag); +void avic_hardware_unsetup(void); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb); diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h index bc5ece76533a..412d0829d7a2 100644 --- a/arch/x86/kvm/vmx/common.h +++ b/arch/x86/kvm/vmx/common.h @@ -98,7 +98,7 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) ? PFERR_PRESENT_MASK : 0; - if (error_code & EPT_VIOLATION_GVA_IS_VALID) + if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID) error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 76271962cb70..bcea087b642f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6728,6 +6728,14 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_NOTIFY: /* Notify VM exit is not exposed to L1 */ return false; + case EXIT_REASON_SEAMCALL: + case EXIT_REASON_TDCALL: + /* + * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't + * virtualized by KVM for L1 hypervisors, i.e. L1 should + * never want or expect such an exit. + */ + return false; default: return true; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f87c216d976d..91b6f2f3edc2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6032,6 +6032,12 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) return 1; } +static int handle_tdx_instruction(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + #ifndef CONFIG_X86_SGX_KVM static int handle_encls(struct kvm_vcpu *vcpu) { @@ -6157,6 +6163,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, [EXIT_REASON_NOTIFY] = handle_notify, + [EXIT_REASON_SEAMCALL] = handle_tdx_instruction, + [EXIT_REASON_TDCALL] = handle_tdx_instruction, [EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm, [EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4b5d2d09634..c9c2aa6f4705 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3874,15 +3874,9 @@ static void record_steal_time(struct kvm_vcpu *vcpu) /* * Returns true if the MSR in question is managed via XSTATE, i.e. is context - * switched with the rest of guest FPU state. Note! S_CET is _not_ context - * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS. - * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields, - * the value saved/restored via XSTATE is always the host's value. That detail - * is _extremely_ important, as the guest's S_CET must _never_ be resident in - * hardware while executing in the host. Loading guest values for U_CET and - * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to - * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower - * privilege levels, i.e. are effectively only consumed by userspace as well. + * switched with the rest of guest FPU state. + * + * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS. */ static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) { @@ -3905,6 +3899,11 @@ static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) * MSR that is managed via XSTATE. Note, the caller is responsible for doing * the initial FPU load, this helper only ensures that guest state is resident * in hardware (the kernel can load its FPU state in IRQ context). + * + * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the + * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only + * consumed when transitioning to lower privilege levels, i.e. are effectively + * only consumed by userspace as well. */ static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info, @@ -11807,6 +11806,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); @@ -11815,6 +11817,9 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); @@ -12137,9 +12142,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int r; vcpu_load(vcpu); - if (kvm_mpx_supported()) - kvm_load_guest_fpu(vcpu); - kvm_vcpu_srcu_read_lock(vcpu); r = kvm_apic_accept_events(vcpu); @@ -12156,9 +12158,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, out: kvm_vcpu_srcu_read_unlock(vcpu); - - if (kvm_mpx_supported()) - kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); return r; } @@ -12788,6 +12787,7 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) { struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; u64 xfeatures_mask; + bool fpu_in_use; int i; /* @@ -12811,13 +12811,23 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX); /* - * All paths that lead to INIT are required to load the guest's FPU - * state (because most paths are buried in KVM_RUN). - */ - kvm_put_guest_fpu(vcpu); + * Unload guest FPU state (if necessary) before zeroing XSTATE fields + * as the kernel can only modify the state when its resident in memory, + * i.e. when it's not loaded into hardware. + * + * WARN if the vCPU's desire to run, i.e. whether or not its in KVM_RUN, + * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the + * only path that can trigger INIT emulation _and_ loads FPU state, and + * KVM_RUN should _always_ load FPU state. + */ + WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use); + fpu_in_use = fpstate->in_use; + if (fpu_in_use) + kvm_put_guest_fpu(vcpu); for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX) fpstate_clear_xstate_component(fpstate, i); - kvm_load_guest_fpu(vcpu); + if (fpu_in_use) + kvm_load_guest_fpu(vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4e385cbfd444..e03eeec55cfe 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -63,11 +63,10 @@ static bool is_string_insn(struct insn *insn) bool insn_has_rep_prefix(struct insn *insn) { insn_byte_t p; - int i; insn_get_prefixes(insn); - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { if (p == 0xf2 || p == 0xf3) return true; } @@ -92,13 +91,13 @@ bool insn_has_rep_prefix(struct insn *insn) static int get_seg_reg_override_idx(struct insn *insn) { int idx = INAT_SEG_REG_DEFAULT; - int num_overrides = 0, i; + int num_overrides = 0; insn_byte_t p; insn_get_prefixes(insn); /* Look for any segment override prefixes. */ - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { insn_attr_t attr; attr = inat_get_opcode_attribute(p); @@ -1676,3 +1675,147 @@ enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes) return type; } + +/* + * Recognise typical NOP patterns for both 32bit and 64bit. + * + * Notably: + * - NOP, but not: REP NOP aka PAUSE + * - NOPL + * - MOV %reg, %reg + * - LEA 0(%reg),%reg + * - JMP +0 + * + * Must not have false-positives; instructions identified as a NOP might be + * emulated as a NOP (uprobe) or Run Length Encoded in a larger NOP + * (alternatives). + * + * False-negatives are fine; need not be exhaustive. + */ +bool insn_is_nop(struct insn *insn) +{ + u8 b3 = 0, x3 = 0, r3 = 0; + u8 b4 = 0, x4 = 0, r4 = 0, m = 0; + u8 modrm, modrm_mod, modrm_reg, modrm_rm; + u8 sib = 0, sib_scale, sib_index, sib_base; + u8 nrex, rex; + u8 p, rep = 0; + + if ((nrex = insn->rex_prefix.nbytes)) { + rex = insn->rex_prefix.bytes[nrex-1]; + + r3 = !!X86_REX_R(rex); + x3 = !!X86_REX_X(rex); + b3 = !!X86_REX_B(rex); + if (nrex > 1) { + r4 = !!X86_REX2_R(rex); + x4 = !!X86_REX2_X(rex); + b4 = !!X86_REX2_B(rex); + m = !!X86_REX2_M(rex); + } + + } else if (insn->vex_prefix.nbytes) { + /* + * Ignore VEX encoded NOPs + */ + return false; + } + + if (insn->modrm.nbytes) { + modrm = insn->modrm.bytes[0]; + modrm_mod = X86_MODRM_MOD(modrm); + modrm_reg = X86_MODRM_REG(modrm) + 8*r3 + 16*r4; + modrm_rm = X86_MODRM_RM(modrm) + 8*b3 + 16*b4; + modrm = 1; + } + + if (insn->sib.nbytes) { + sib = insn->sib.bytes[0]; + sib_scale = X86_SIB_SCALE(sib); + sib_index = X86_SIB_INDEX(sib) + 8*x3 + 16*x4; + sib_base = X86_SIB_BASE(sib) + 8*b3 + 16*b4; + sib = 1; + + modrm_rm = sib_base; + } + + for_each_insn_prefix(insn, p) { + if (p == 0xf3) /* REPE */ + rep = 1; + } + + /* + * Opcode map munging: + * + * REX2: 0 - single byte opcode + * 1 - 0f second byte opcode + */ + switch (m) { + case 0: break; + case 1: insn->opcode.value <<= 8; + insn->opcode.value |= 0x0f; + break; + default: + return false; + } + + switch (insn->opcode.bytes[0]) { + case 0x0f: /* 2nd byte */ + break; + + case 0x89: /* MOV */ + if (modrm_mod != 3) /* register-direct */ + return false; + + /* native size */ + if (insn->opnd_bytes != 4 * (1 + insn->x86_64)) + return false; + + return modrm_reg == modrm_rm; /* MOV %reg, %reg */ + + case 0x8d: /* LEA */ + if (modrm_mod == 0 || modrm_mod == 3) /* register-indirect with disp */ + return false; + + /* native size */ + if (insn->opnd_bytes != 4 * (1 + insn->x86_64)) + return false; + + if (insn->displacement.value != 0) + return false; + + if (sib && (sib_scale != 0 || sib_index != 4)) /* (%reg, %eiz, 1) */ + return false; + + for_each_insn_prefix(insn, p) { + if (p != 0x3e) /* DS */ + return false; + } + + return modrm_reg == modrm_rm; /* LEA 0(%reg), %reg */ + + case 0x90: /* NOP */ + if (b3 || b4) /* XCHG %r{8,16,24},%rax */ + return false; + + if (rep) /* REP NOP := PAUSE */ + return false; + + return true; + + case 0xe9: /* JMP.d32 */ + case 0xeb: /* JMP.d8 */ + return insn->immediate.value == 0; /* JMP +0 */ + + default: + return false; + } + + switch (insn->opcode.bytes[1]) { + case 0x1f: + return modrm_reg == 0; /* 0f 1f /0 -- NOPL */ + + default: + return false; + } +} diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index b5893928d55c..8c7cd115b484 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -22,7 +22,7 @@ #include <asm/setup.h> #define debug_putstr(v) early_printk("%s", v) -#define has_cpuflag(f) boot_cpu_has(f) +#define has_cpuflag(f) cpu_feature_enabled(f) #define get_boot_seed() kaslr_offset() #endif diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h index fc1c887ca073..654bfe4e29a0 100644 --- a/arch/x86/math-emu/poly.h +++ b/arch/x86/math-emu/poly.h @@ -39,7 +39,7 @@ asmlinkage void mul_Xsig_Xsig(Xsig *dest, const Xsig *mult); asmlinkage void shr_Xsig(Xsig *, const int n); asmlinkage int round_Xsig(Xsig *); asmlinkage int norm_Xsig(Xsig *); -asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest); +asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, Xsig *dest); /* Macro to extract the most significant 32 bits from a long long */ #define LL_MSW(x) (((unsigned long *)&x)[1]) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0e4270e20fad..1044aafd5d94 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -504,9 +504,6 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, continue; } - if (0) - pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, - pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init); paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index fc3f3d3e2ef2..8d31c6b9e184 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -31,17 +31,6 @@ unsigned long __phys_addr(unsigned long x) return x; } EXPORT_SYMBOL(__phys_addr); - -unsigned long __phys_addr_symbol(unsigned long x) -{ - unsigned long y = x - __START_KERNEL_map; - - /* only check upper bounds since lower bounds will trigger carry */ - VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); - - return y + phys_base; -} -EXPORT_SYMBOL(__phys_addr_symbol); #endif bool __virt_addr_valid(unsigned long x) diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 374e4cb788d8..438a3b170402 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/block/bdev.c b/block/bdev.c index 810707cca970..b8fbb9576110 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -67,7 +67,7 @@ static void bdev_write_inode(struct block_device *bdev) int ret; spin_lock(&inode->i_lock); - while (inode->i_state & I_DIRTY) { + while (inode_state_read(inode) & I_DIRTY) { spin_unlock(&inode->i_lock); ret = write_inode_now(inode, true); if (ret) @@ -217,9 +217,26 @@ int set_blocksize(struct file *file, int size) EXPORT_SYMBOL(set_blocksize); +static int sb_validate_large_blocksize(struct super_block *sb, int size) +{ + const char *err_str = NULL; + + if (!(sb->s_type->fs_flags & FS_LBS)) + err_str = "not supported by filesystem"; + else if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + err_str = "is only supported with CONFIG_TRANSPARENT_HUGEPAGE"; + + if (!err_str) + return 0; + + pr_warn_ratelimited("%s: block size(%d) > page size(%lu) %s\n", + sb->s_type->name, size, PAGE_SIZE, err_str); + return -EINVAL; +} + int sb_set_blocksize(struct super_block *sb, int size) { - if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE) + if (size > PAGE_SIZE && sb_validate_large_blocksize(sb, size)) return 0; if (set_blocksize(sb->s_bdev_file, size)) return 0; @@ -231,7 +248,7 @@ int sb_set_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_set_blocksize); -int sb_min_blocksize(struct super_block *sb, int size) +int __must_check sb_min_blocksize(struct super_block *sb, int size) { int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) @@ -1265,7 +1282,7 @@ void sync_bdevs(bool wait) struct block_device *bdev; spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW) || mapping->nrpages == 0) { spin_unlock(&inode->i_lock); continue; diff --git a/block/fops.c b/block/fops.c index 5e3db9fead77..4dad9c2d5796 100644 --- a/block/fops.c +++ b/block/fops.c @@ -540,12 +540,13 @@ const struct address_space_operations def_blk_aops = { #else /* CONFIG_BUFFER_HEAD */ static int blkdev_read_folio(struct file *file, struct folio *folio) { - return iomap_read_folio(folio, &blkdev_iomap_ops); + iomap_bio_read_folio(folio, &blkdev_iomap_ops); + return 0; } static void blkdev_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &blkdev_iomap_ops); + iomap_bio_readahead(rac, &blkdev_iomap_ops); } static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc, diff --git a/drivers/acpi/acpi_mrrm.c b/drivers/acpi/acpi_mrrm.c index a6dbf623e557..6d69554c940e 100644 --- a/drivers/acpi/acpi_mrrm.c +++ b/drivers/acpi/acpi_mrrm.c @@ -152,26 +152,49 @@ ATTRIBUTE_GROUPS(memory_range); static __init int add_boot_memory_ranges(void) { - struct kobject *pkobj, *kobj; + struct kobject *pkobj, *kobj, **kobjs; int ret = -EINVAL; - char *name; + char name[16]; + int i; pkobj = kobject_create_and_add("memory_ranges", acpi_kobj); + if (!pkobj) + return -ENOMEM; - for (int i = 0; i < mrrm_mem_entry_num; i++) { - name = kasprintf(GFP_KERNEL, "range%d", i); - if (!name) { - ret = -ENOMEM; - break; - } + kobjs = kcalloc(mrrm_mem_entry_num, sizeof(*kobjs), GFP_KERNEL); + if (!kobjs) { + kobject_put(pkobj); + return -ENOMEM; + } + for (i = 0; i < mrrm_mem_entry_num; i++) { + scnprintf(name, sizeof(name), "range%d", i); kobj = kobject_create_and_add(name, pkobj); + if (!kobj) { + ret = -ENOMEM; + goto cleanup; + } ret = sysfs_create_groups(kobj, memory_range_groups); - if (ret) - return ret; + if (ret) { + kobject_put(kobj); + goto cleanup; + } + kobjs[i] = kobj; } + kfree(kobjs); + return 0; + +cleanup: + for (int j = 0; j < i; j++) { + if (kobjs[j]) { + sysfs_remove_groups(kobjs[j], memory_range_groups); + kobject_put(kobjs[j]); + } + } + kfree(kobjs); + kobject_put(pkobj); return ret; } diff --git a/drivers/acpi/apei/einj-core.c b/drivers/acpi/apei/einj-core.c index 3c87953dbd19..305c240a303f 100644 --- a/drivers/acpi/apei/einj-core.c +++ b/drivers/acpi/apei/einj-core.c @@ -182,6 +182,7 @@ bool einj_initialized __ro_after_init; static void __iomem *einj_param; static u32 v5param_size; +static u32 v66param_size; static bool is_v2; static void einj_exec_ctx_init(struct apei_exec_context *ctx) @@ -283,6 +284,24 @@ static void check_vendor_extension(u64 paddr, acpi_os_unmap_iomem(p, sizeof(v)); } +static u32 einjv2_init(struct einjv2_extension_struct *e) +{ + if (e->revision != 1) { + pr_info("Unknown v2 extension revision %u\n", e->revision); + return 0; + } + if (e->length < sizeof(*e) || e->length > PAGE_SIZE) { + pr_info(FW_BUG "Bad1 v2 extension length %u\n", e->length); + return 0; + } + if ((e->length - sizeof(*e)) % sizeof(e->component_arr[0])) { + pr_info(FW_BUG "Bad2 v2 extension length %u\n", e->length); + return 0; + } + + return (e->length - sizeof(*e)) / sizeof(e->component_arr[0]); +} + static void __iomem *einj_get_parameter_address(void) { int i; @@ -310,28 +329,21 @@ static void __iomem *einj_get_parameter_address(void) v5param_size = sizeof(v5param); p = acpi_os_map_iomem(pa_v5, sizeof(*p)); if (p) { - int offset, len; - memcpy_fromio(&v5param, p, v5param_size); acpi5 = 1; check_vendor_extension(pa_v5, &v5param); - if (is_v2 && available_error_type & ACPI65_EINJV2_SUPP) { - len = v5param.einjv2_struct.length; - offset = offsetof(struct einjv2_extension_struct, component_arr); - max_nr_components = (len - offset) / - sizeof(v5param.einjv2_struct.component_arr[0]); - /* - * The first call to acpi_os_map_iomem above does not include the - * component array, instead it is used to read and calculate maximum - * number of components supported by the system. Below, the mapping - * is expanded to include the component array. - */ + if (available_error_type & ACPI65_EINJV2_SUPP) { + struct einjv2_extension_struct *e; + + e = &v5param.einjv2_struct; + max_nr_components = einjv2_init(e); + + /* remap including einjv2_extension_struct */ acpi_os_unmap_iomem(p, v5param_size); - offset = offsetof(struct set_error_type_with_address, einjv2_struct); - v5param_size = offset + struct_size(&v5param.einjv2_struct, - component_arr, max_nr_components); - p = acpi_os_map_iomem(pa_v5, v5param_size); + v66param_size = v5param_size - sizeof(*e) + e->length; + p = acpi_os_map_iomem(pa_v5, v66param_size); } + return p; } } @@ -527,6 +539,7 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3, u64 param4) { struct apei_exec_context ctx; + u32 param_size = is_v2 ? v66param_size : v5param_size; u64 val, trigger_paddr, timeout = FIRMWARE_TIMEOUT; int i, rc; @@ -539,11 +552,11 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, if (acpi5) { struct set_error_type_with_address *v5param; - v5param = kmalloc(v5param_size, GFP_KERNEL); + v5param = kmalloc(param_size, GFP_KERNEL); if (!v5param) return -ENOMEM; - memcpy_fromio(v5param, einj_param, v5param_size); + memcpy_fromio(v5param, einj_param, param_size); v5param->type = type; if (type & ACPI5_VENDOR_BIT) { switch (vendor_flags) { @@ -601,7 +614,7 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, break; } } - memcpy_toio(einj_param, v5param, v5param_size); + memcpy_toio(einj_param, v5param, param_size); kfree(v5param); } else { rc = apei_exec_run(&ctx, ACPI_EINJ_SET_ERROR_TYPE); @@ -1132,9 +1145,14 @@ static void einj_remove(struct faux_device *fdev) struct apei_exec_context ctx; if (einj_param) { - acpi_size size = (acpi5) ? - v5param_size : - sizeof(struct einj_parameter); + acpi_size size; + + if (v66param_size) + size = v66param_size; + else if (acpi5) + size = v5param_size; + else + size = sizeof(struct einj_parameter); acpi_os_unmap_iomem(einj_param, size); if (vendor_errors.size) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index fd995a1d3d24..8cc8af8fd408 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -430,10 +430,10 @@ static int __init gtdt_platform_timer_init(void) continue; pdev = platform_device_register_data(NULL, "gtdt-arm-mmio-timer", - gwdt_count, &atm, + mmio_timer_count, &atm, sizeof(atm)); if (IS_ERR(pdev)) { - pr_err("Can't register timer %d\n", gwdt_count); + pr_err("Can't register timer %d\n", mmio_timer_count); continue; } diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index ab4651205e8a..3bdeeee3414e 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -460,7 +460,7 @@ bool acpi_cpc_valid(void) if (acpi_disabled) return false; - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { cpc_ptr = per_cpu(cpc_desc_ptr, cpu); if (!cpc_ptr) return false; @@ -476,7 +476,7 @@ bool cppc_allow_fast_switch(void) struct cpc_desc *cpc_ptr; int cpu; - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { cpc_ptr = per_cpu(cpc_desc_ptr, cpu); desired_reg = &cpc_ptr->cpc_regs[DESIRED_PERF]; if (!CPC_IN_SYSTEM_MEMORY(desired_reg) && @@ -750,7 +750,7 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) } /* - * Disregard _CPC if the number of entries in the return pachage is not + * Disregard _CPC if the number of entries in the return package is not * as expected, but support future revisions being proper supersets of * the v3 and only causing more entries to be returned by _CPC. */ @@ -1435,7 +1435,7 @@ bool cppc_perf_ctrs_in_pcc(void) { int cpu; - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { struct cpc_register_resource *ref_perf_reg; struct cpc_desc *cpc_desc; diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c index 76a856c32c4d..d1595156c86a 100644 --- a/drivers/acpi/irq.c +++ b/drivers/acpi/irq.c @@ -300,6 +300,25 @@ int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) } EXPORT_SYMBOL_GPL(acpi_irq_get); +const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, + unsigned int index) +{ + struct irq_fwspec_info info; + struct irq_fwspec fwspec; + unsigned long flags; + + if (acpi_irq_parse_one(handle, index, &fwspec, &flags)) + return NULL; + + if (irq_populate_fwspec_info(&fwspec, &info)) + return NULL; + + if (!(info.flags & IRQ_FWSPEC_INFO_AFFINITY_VALID)) + return NULL; + + return info.affinity; +} + /** * acpi_set_irq_model - Setup the GSI irqdomain information * @model: the value assigned to acpi_irq_model diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 5a36d57289b4..11e4483685c9 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -874,11 +874,33 @@ static void hmat_register_target_devices(struct memory_target *target) } } -static void hmat_register_target(struct memory_target *target) +static void hmat_hotplug_target(struct memory_target *target) { int nid = pxm_to_node(target->memory_pxm); /* + * Skip offline nodes. This can happen when memory marked EFI_MEMORY_SP, + * "specific purpose", is applied to all the memory in a proximity + * domain leading to * the node being marked offline / unplugged, or if + * memory-only "hotplug" node is offline. + */ + if (nid == NUMA_NO_NODE || !node_online(nid)) + return; + + guard(mutex)(&target_lock); + if (target->registered) + return; + + hmat_register_target_initiators(target); + hmat_register_target_cache(target); + hmat_register_target_perf(target, ACCESS_COORDINATE_LOCAL); + hmat_register_target_perf(target, ACCESS_COORDINATE_CPU); + target->registered = true; +} + +static void hmat_register_target(struct memory_target *target) +{ + /* * Devices may belong to either an offline or online * node, so unconditionally add them. */ @@ -895,25 +917,7 @@ static void hmat_register_target(struct memory_target *target) } mutex_unlock(&target_lock); - /* - * Skip offline nodes. This can happen when memory - * marked EFI_MEMORY_SP, "specific purpose", is applied - * to all the memory in a proximity domain leading to - * the node being marked offline / unplugged, or if - * memory-only "hotplug" node is offline. - */ - if (nid == NUMA_NO_NODE || !node_online(nid)) - return; - - mutex_lock(&target_lock); - if (!target->registered) { - hmat_register_target_initiators(target); - hmat_register_target_cache(target); - hmat_register_target_perf(target, ACCESS_COORDINATE_LOCAL); - hmat_register_target_perf(target, ACCESS_COORDINATE_CPU); - target->registered = true; - } - mutex_unlock(&target_lock); + hmat_hotplug_target(target); } static void hmat_register_targets(void) @@ -939,7 +943,7 @@ static int hmat_callback(struct notifier_block *self, if (!target) return NOTIFY_OK; - hmat_register_target(target); + hmat_hotplug_target(target); return NOTIFY_OK; } diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 53816dfab645..aa87ee1583a4 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -237,7 +237,7 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) struct acpi_srat_generic_affinity *p = (struct acpi_srat_generic_affinity *)header; - if (p->device_handle_type == 0) { + if (p->device_handle_type == 1) { /* * For pci devices this may be the only place they * are assigned a proximity domain diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 6792d4385eee..7b8b5d2015ec 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -244,6 +244,12 @@ static struct prm_handler_info *find_prm_handler(const guid_t *guid) return (struct prm_handler_info *) find_guid_info(guid, GET_HANDLER); } +bool acpi_prm_handler_available(const guid_t *guid) +{ + return find_prm_handler(guid) && find_prm_module(guid); +} +EXPORT_SYMBOL_GPL(acpi_prm_handler_available); + /* In-coming PRM commands */ #define PRM_CMD_RUN_SERVICE 0 diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 5d824435b26b..65e779be64ff 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -166,7 +166,8 @@ static int __acpi_processor_start(struct acpi_device *device) if (result && !IS_ENABLED(CONFIG_ACPI_CPU_FREQ_PSS)) dev_dbg(&device->dev, "CPPC data invalid or not present\n"); - acpi_processor_power_init(pr); + if (!cpuidle_get_driver() || cpuidle_get_driver() == &acpi_idle_driver) + acpi_processor_power_init(pr); acpi_pss_perf_init(pr); @@ -262,8 +263,6 @@ static int __init acpi_processor_driver_init(void) if (result < 0) return result; - acpi_processor_register_idle_driver(); - result = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "acpi/cpu-drv:online", acpi_soft_cpu_online, NULL); @@ -302,7 +301,6 @@ static void __exit acpi_processor_driver_exit(void) cpuhp_remove_state_nocalls(hp_online); cpuhp_remove_state_nocalls(CPUHP_ACPI_CPUDRV_DEAD); - acpi_processor_unregister_idle_driver(); driver_unregister(&acpi_processor_driver); } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 22b051b94a86..4166090db642 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -51,7 +51,7 @@ module_param(latency_factor, uint, 0644); static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device); -static struct cpuidle_driver acpi_idle_driver = { +struct cpuidle_driver acpi_idle_driver = { .name = "acpi_idle", .owner = THIS_MODULE, }; @@ -1357,102 +1357,79 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr) return 0; } -void acpi_processor_register_idle_driver(void) -{ - struct acpi_processor *pr; - int ret = -ENODEV; - int cpu; - - /* - * Acpi idle driver is used by all possible CPUs. - * Install the idle handler by the processor power info of one in them. - * Note that we use previously set idle handler will be used on - * platforms that only support C1. - */ - for_each_cpu(cpu, (struct cpumask *)cpu_possible_mask) { - pr = per_cpu(processors, cpu); - if (!pr) - continue; - - ret = acpi_processor_get_power_info(pr); - if (!ret) { - pr->flags.power_setup_done = 1; - acpi_processor_setup_cpuidle_states(pr); - break; - } - } - - if (ret) { - pr_debug("No ACPI power information from any CPUs.\n"); - return; - } +static int acpi_processor_registered; - ret = cpuidle_register_driver(&acpi_idle_driver); - if (ret) { - pr_debug("register %s failed.\n", acpi_idle_driver.name); - return; - } - pr_debug("%s registered with cpuidle.\n", acpi_idle_driver.name); -} - -void acpi_processor_unregister_idle_driver(void) -{ - cpuidle_unregister_driver(&acpi_idle_driver); -} - -void acpi_processor_power_init(struct acpi_processor *pr) +int acpi_processor_power_init(struct acpi_processor *pr) { + int retval; struct cpuidle_device *dev; - /* - * The code below only works if the current cpuidle driver is the ACPI - * idle driver. - */ - if (cpuidle_get_driver() != &acpi_idle_driver) - return; - if (disabled_by_idle_boot_param()) - return; + return 0; acpi_processor_cstate_first_run_checks(); if (!acpi_processor_get_power_info(pr)) pr->flags.power_setup_done = 1; - if (!pr->flags.power) - return; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return; + /* + * Install the idle handler if processor power management is supported. + * Note that we use previously set idle handler will be used on + * platforms that only support C1. + */ + if (pr->flags.power) { + /* Register acpi_idle_driver if not already registered */ + if (!acpi_processor_registered) { + acpi_processor_setup_cpuidle_states(pr); + retval = cpuidle_register_driver(&acpi_idle_driver); + if (retval) + return retval; + pr_debug("%s registered with cpuidle\n", + acpi_idle_driver.name); + } - per_cpu(acpi_cpuidle_device, pr->id) = dev; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + per_cpu(acpi_cpuidle_device, pr->id) = dev; - acpi_processor_setup_cpuidle_dev(pr, dev); + acpi_processor_setup_cpuidle_dev(pr, dev); - /* - * Register a cpuidle device for this CPU. The cpuidle driver using - * this device is expected to be registered. - */ - if (cpuidle_register_device(dev)) { - per_cpu(acpi_cpuidle_device, pr->id) = NULL; - kfree(dev); + /* Register per-cpu cpuidle_device. Cpuidle driver + * must already be registered before registering device + */ + retval = cpuidle_register_device(dev); + if (retval) { + if (acpi_processor_registered == 0) + cpuidle_unregister_driver(&acpi_idle_driver); + + per_cpu(acpi_cpuidle_device, pr->id) = NULL; + kfree(dev); + return retval; + } + acpi_processor_registered++; } + return 0; } -void acpi_processor_power_exit(struct acpi_processor *pr) +int acpi_processor_power_exit(struct acpi_processor *pr) { struct cpuidle_device *dev = per_cpu(acpi_cpuidle_device, pr->id); if (disabled_by_idle_boot_param()) - return; + return 0; if (pr->flags.power) { cpuidle_unregister_device(dev); + acpi_processor_registered--; + if (acpi_processor_registered == 0) + cpuidle_unregister_driver(&acpi_idle_driver); + kfree(dev); } pr->flags.power_setup_done = 0; + return 0; } MODULE_IMPORT_NS("ACPI_PROCESSOR_IDLE"); diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index a3f95a3fffde..d3edc3bcbf01 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -487,7 +487,7 @@ static int acpi_battery_read(struct acpi_battery *battery) if (result) return result; - battery->present = state & (1 << battery->id); + battery->present = !!(state & (1 << battery->id)); if (!battery->present) return 0; diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 2a210719c4ce..f48fb63d7e85 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3006,6 +3006,16 @@ int ata_dev_configure(struct ata_device *dev) } dev->n_sectors = ata_id_n_sectors(id); + if (ata_id_is_locked(id)) { + /* + * If Security locked, set capacity to zero to prevent + * any I/O, e.g. partition scanning, as any I/O to a + * locked drive will result in user visible errors. + */ + ata_dev_info(dev, + "Security locked, setting capacity to zero\n"); + dev->n_sectors = 0; + } /* get current R/W Multiple count setting */ if ((dev->id[47] >> 8) == 0x80 && (dev->id[59] & 0x100)) { diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index b43a3196e2be..434774e71fe6 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -992,6 +992,13 @@ static void ata_gen_ata_sense(struct ata_queued_cmd *qc) return; } + if (ata_id_is_locked(dev->id)) { + /* Security locked */ + /* LOGICAL UNIT ACCESS NOT AUTHORIZED */ + ata_scsi_set_sense(dev, cmd, DATA_PROTECT, 0x74, 0x71); + return; + } + if (!(qc->flags & ATA_QCFLAG_RTF_FILLED)) { ata_dev_dbg(dev, "Missing result TF: reporting aborted command\n"); @@ -4894,8 +4901,10 @@ void ata_scsi_dev_rescan(struct work_struct *work) spin_unlock_irqrestore(ap->lock, flags); if (do_resume) { ret = scsi_resume_device(sdev); - if (ret == -EWOULDBLOCK) + if (ret == -EWOULDBLOCK) { + scsi_device_put(sdev); goto unlock_scan; + } dev->flags &= ~ATA_DFLAG_RESUMING; } ret = scsi_rescan_device(sdev); diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index 4fea1149e003..f62e38571440 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -1374,7 +1374,9 @@ fore200e_open(struct atm_vcc *vcc) vcc->dev_data = NULL; + mutex_lock(&fore200e->rate_mtx); fore200e->available_cell_rate += vcc->qos.txtp.max_pcr; + mutex_unlock(&fore200e->rate_mtx); kfree(fore200e_vcc); return -EINVAL; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 9d4e46ad8352..2f576ecf1832 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -180,7 +180,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (IS_ERR(dentry)) return PTR_ERR(dentry); - dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, NULL); if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; @@ -231,7 +231,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, return PTR_ERR(dentry); err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, - dev->devt); + dev->devt, NULL); if (!err) { struct iattr newattrs; @@ -261,7 +261,7 @@ static int dev_rmdir(const char *name) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), - dentry); + dentry, NULL); else err = -EPERM; diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 6942c62fa59d..bee3050a20d9 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -829,8 +829,6 @@ _request_firmware(const struct firmware **firmware_p, const char *name, size_t offset, u32 opt_flags) { struct firmware *fw = NULL; - struct cred *kern_cred = NULL; - const struct cred *old_cred; bool nondirect = false; int ret; @@ -871,45 +869,38 @@ _request_firmware(const struct firmware **firmware_p, const char *name, * called by a driver when serving an unrelated request from userland, we use * the kernel credentials to read the file. */ - kern_cred = prepare_kernel_cred(&init_task); - if (!kern_cred) { - ret = -ENOMEM; - goto out; - } - old_cred = override_creds(kern_cred); + scoped_with_kernel_creds() { + ret = fw_get_filesystem_firmware(device, fw->priv, "", NULL); - ret = fw_get_filesystem_firmware(device, fw->priv, "", NULL); - - /* Only full reads can support decompression, platform, and sysfs. */ - if (!(opt_flags & FW_OPT_PARTIAL)) - nondirect = true; + /* Only full reads can support decompression, platform, and sysfs. */ + if (!(opt_flags & FW_OPT_PARTIAL)) + nondirect = true; #ifdef CONFIG_FW_LOADER_COMPRESS_ZSTD - if (ret == -ENOENT && nondirect) - ret = fw_get_filesystem_firmware(device, fw->priv, ".zst", - fw_decompress_zstd); + if (ret == -ENOENT && nondirect) + ret = fw_get_filesystem_firmware(device, fw->priv, ".zst", + fw_decompress_zstd); #endif #ifdef CONFIG_FW_LOADER_COMPRESS_XZ - if (ret == -ENOENT && nondirect) - ret = fw_get_filesystem_firmware(device, fw->priv, ".xz", - fw_decompress_xz); + if (ret == -ENOENT && nondirect) + ret = fw_get_filesystem_firmware(device, fw->priv, ".xz", + fw_decompress_xz); #endif - if (ret == -ENOENT && nondirect) - ret = firmware_fallback_platform(fw->priv); + if (ret == -ENOENT && nondirect) + ret = firmware_fallback_platform(fw->priv); - if (ret) { - if (!(opt_flags & FW_OPT_NO_WARN)) - dev_warn(device, - "Direct firmware load for %s failed with error %d\n", - name, ret); - if (nondirect) - ret = firmware_fallback_sysfs(fw, name, device, - opt_flags, ret); - } else - ret = assign_fw(fw, device); - - revert_creds(old_cred); - put_cred(kern_cred); + if (ret) { + if (!(opt_flags & FW_OPT_NO_WARN)) + dev_warn(device, + "Direct firmware load for %s failed with error %d\n", + name, ret); + if (nondirect) + ret = firmware_fallback_sysfs(fw, name, device, + opt_flags, ret); + } else { + ret = assign_fw(fw, device); + } + } out: if (ret < 0) { diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 09450349cf32..b45d41b018ca 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -150,25 +150,37 @@ devm_platform_ioremap_resource_byname(struct platform_device *pdev, EXPORT_SYMBOL_GPL(devm_platform_ioremap_resource_byname); #endif /* CONFIG_HAS_IOMEM */ +static const struct cpumask *get_irq_affinity(struct platform_device *dev, + unsigned int num) +{ + const struct cpumask *mask = NULL; +#ifndef CONFIG_SPARC + struct fwnode_handle *fwnode = dev_fwnode(&dev->dev); + + if (is_of_node(fwnode)) + mask = of_irq_get_affinity(to_of_node(fwnode), num); + else if (is_acpi_device_node(fwnode)) + mask = acpi_irq_get_affinity(ACPI_HANDLE_FWNODE(fwnode), num); +#endif + + return mask ?: cpu_possible_mask; +} + /** - * platform_get_irq_optional - get an optional IRQ for a device - * @dev: platform device - * @num: IRQ number index + * platform_get_irq_affinity - get an optional IRQ and its affinity for a device + * @dev: platform device + * @num: interrupt number index + * @affinity: optional cpumask pointer to get the affinity of a per-cpu interrupt * - * Gets an IRQ for a platform device. Device drivers should check the return - * value for errors so as to not pass a negative integer value to the - * request_irq() APIs. This is the same as platform_get_irq(), except that it - * does not print an error message if an IRQ can not be obtained. - * - * For example:: - * - * int irq = platform_get_irq_optional(pdev, 0); - * if (irq < 0) - * return irq; + * Gets an interupt for a platform device. Device drivers should check the + * return value for errors so as to not pass a negative integer value to + * the request_irq() APIs. Optional affinity information is provided in the + * affinity pointer if available, and NULL otherwise. * - * Return: non-zero IRQ number on success, negative error number on failure. + * Return: non-zero interrupt number on success, negative error number on failure. */ -int platform_get_irq_optional(struct platform_device *dev, unsigned int num) +int platform_get_irq_affinity(struct platform_device *dev, unsigned int num, + const struct cpumask **affinity) { int ret; #ifdef CONFIG_SPARC @@ -236,8 +248,37 @@ out_not_found: out: if (WARN(!ret, "0 is an invalid IRQ number\n")) return -EINVAL; + + if (ret > 0 && affinity) + *affinity = get_irq_affinity(dev, num); + return ret; } +EXPORT_SYMBOL_GPL(platform_get_irq_affinity); + +/** + * platform_get_irq_optional - get an optional interrupt for a device + * @dev: platform device + * @num: interrupt number index + * + * Gets an interrupt for a platform device. Device drivers should check the + * return value for errors so as to not pass a negative integer value to + * the request_irq() APIs. This is the same as platform_get_irq(), except + * that it does not print an error message if an interrupt can not be + * obtained. + * + * For example:: + * + * int irq = platform_get_irq_optional(pdev, 0); + * if (irq < 0) + * return irq; + * + * Return: non-zero interrupt number on success, negative error number on failure. + */ +int platform_get_irq_optional(struct platform_device *dev, unsigned int num) +{ + return platform_get_irq_affinity(dev, num, NULL); +} EXPORT_SYMBOL_GPL(platform_get_irq_optional); /** diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e83503bdc1fd..1de1cd72b616 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -888,12 +888,15 @@ static void device_resume_early(struct device *dev, pm_message_t state, bool asy TRACE_DEVICE(dev); TRACE_RESUME(0); - if (dev->power.syscore || dev->power.direct_complete) + if (dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) goto Out; + if (dev->power.syscore) + goto Skip; + if (!dpm_wait_for_superior(dev, async)) goto Out; @@ -926,11 +929,11 @@ Run: Skip: dev->power.is_late_suspended = false; + pm_runtime_enable(dev); Out: TRACE_RESUME(error); - pm_runtime_enable(dev); complete_all(&dev->power.completion); if (error) { @@ -1615,12 +1618,6 @@ static void device_suspend_late(struct device *dev, pm_message_t state, bool asy TRACE_DEVICE(dev); TRACE_SUSPEND(0); - /* - * Disable runtime PM for the device without checking if there is a - * pending resume request for it. - */ - __pm_runtime_disable(dev, false); - dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) @@ -1631,9 +1628,18 @@ static void device_suspend_late(struct device *dev, pm_message_t state, bool asy goto Complete; } - if (dev->power.syscore || dev->power.direct_complete) + if (dev->power.direct_complete) goto Complete; + /* + * Disable runtime PM for the device without checking if there is a + * pending resume request for it. + */ + __pm_runtime_disable(dev, false); + + if (dev->power.syscore) + goto Skip; + if (dev->pm_domain) { info = "late power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); @@ -1664,6 +1670,7 @@ Run: WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); + pm_runtime_enable(dev); goto Complete; } dpm_propagate_wakeup_to_parent(dev); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a853c65ac65d..3263040fcf2d 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -52,7 +52,6 @@ static DEFINE_IDR(nbd_index_idr); static DEFINE_MUTEX(nbd_index_mutex); static struct workqueue_struct *nbd_del_wq; -static struct cred *nbd_cred; static int nbd_total_devices = 0; struct nbd_sock { @@ -555,7 +554,6 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send, int result; struct msghdr msg = {} ; unsigned int noreclaim_flag; - const struct cred *old_cred; if (unlikely(!sock)) { dev_err_ratelimited(disk_to_dev(nbd->disk), @@ -564,33 +562,32 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send, return -EINVAL; } - old_cred = override_creds(nbd_cred); - msg.msg_iter = *iter; noreclaim_flag = memalloc_noreclaim_save(); - do { - sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; - sock->sk->sk_use_task_frag = false; - msg.msg_flags = msg_flags | MSG_NOSIGNAL; - - if (send) - result = sock_sendmsg(sock, &msg); - else - result = sock_recvmsg(sock, &msg, msg.msg_flags); - - if (result <= 0) { - if (result == 0) - result = -EPIPE; /* short read */ - break; - } - if (sent) - *sent += result; - } while (msg_data_left(&msg)); - memalloc_noreclaim_restore(noreclaim_flag); + scoped_with_kernel_creds() { + do { + sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; + sock->sk->sk_use_task_frag = false; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; - revert_creds(old_cred); + if (send) + result = sock_sendmsg(sock, &msg); + else + result = sock_recvmsg(sock, &msg, msg.msg_flags); + + if (result <= 0) { + if (result == 0) + result = -EPIPE; /* short read */ + break; + } + if (sent) + *sent += result; + } while (msg_data_left(&msg)); + } + + memalloc_noreclaim_restore(noreclaim_flag); return result; } @@ -2683,15 +2680,7 @@ static int __init nbd_init(void) return -ENOMEM; } - nbd_cred = prepare_kernel_cred(&init_task); - if (!nbd_cred) { - destroy_workqueue(nbd_del_wq); - unregister_blkdev(NBD_MAJOR, "nbd"); - return -ENOMEM; - } - if (genl_register_family(&nbd_genl_family)) { - put_cred(nbd_cred); destroy_workqueue(nbd_del_wq); unregister_blkdev(NBD_MAJOR, "nbd"); return -EINVAL; @@ -2746,7 +2735,6 @@ static void __exit nbd_cleanup(void) /* Also wait for nbd_dev_remove_work() completes */ destroy_workqueue(nbd_del_wq); - put_cred(nbd_cred); idr_destroy(&nbd_index_idr); unregister_blkdev(NBD_MAJOR, "nbd"); } diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c index 6abd962502e3..52794db2739b 100644 --- a/drivers/bluetooth/btrtl.c +++ b/drivers/bluetooth/btrtl.c @@ -50,7 +50,7 @@ #define RTL_CHIP_SUBVER (&(struct rtl_vendor_cmd) {{0x10, 0x38, 0x04, 0x28, 0x80}}) #define RTL_CHIP_REV (&(struct rtl_vendor_cmd) {{0x10, 0x3A, 0x04, 0x28, 0x80}}) -#define RTL_SEC_PROJ (&(struct rtl_vendor_cmd) {{0x10, 0xA4, 0x0D, 0x00, 0xb0}}) +#define RTL_SEC_PROJ (&(struct rtl_vendor_cmd) {{0x10, 0xA4, 0xAD, 0x00, 0xb0}}) #define RTL_PATCH_SNIPPETS 0x01 #define RTL_PATCH_DUMMY_HEADER 0x02 @@ -534,7 +534,6 @@ static int rtlbt_parse_firmware_v2(struct hci_dev *hdev, { struct rtl_epatch_header_v2 *hdr; int rc; - u8 reg_val[2]; u8 key_id; u32 num_sections; struct rtl_section *section; @@ -549,14 +548,7 @@ static int rtlbt_parse_firmware_v2(struct hci_dev *hdev, .len = btrtl_dev->fw_len - 7, /* Cut the tail */ }; - rc = btrtl_vendor_read_reg16(hdev, RTL_SEC_PROJ, reg_val); - if (rc < 0) - return -EIO; - key_id = reg_val[0]; - - rtl_dev_dbg(hdev, "%s: key id %u", __func__, key_id); - - btrtl_dev->key_id = key_id; + key_id = btrtl_dev->key_id; hdr = rtl_iov_pull_data(&iov, sizeof(*hdr)); if (!hdr) @@ -625,8 +617,10 @@ static int rtlbt_parse_firmware_v2(struct hci_dev *hdev, len += entry->len; } - if (!len) + if (!len) { + kvfree(ptr); return -EPERM; + } *_buf = ptr; return len; @@ -1068,6 +1062,8 @@ struct btrtl_device_info *btrtl_initialize(struct hci_dev *hdev, u16 hci_rev, lmp_subver; u8 hci_ver, lmp_ver, chip_type = 0; int ret; + int rc; + u8 key_id; u8 reg_val[2]; btrtl_dev = kzalloc(sizeof(*btrtl_dev), GFP_KERNEL); @@ -1178,6 +1174,14 @@ next: goto err_free; } + rc = btrtl_vendor_read_reg16(hdev, RTL_SEC_PROJ, reg_val); + if (rc < 0) + goto err_free; + + key_id = reg_val[0]; + btrtl_dev->key_id = key_id; + rtl_dev_info(hdev, "%s: key id %u", __func__, key_id); + btrtl_dev->fw_len = -EIO; if (lmp_subver == RTL_ROM_LMP_8852A && hci_rev == 0x000c) { snprintf(fw_name, sizeof(fw_name), "%s_v2.bin", @@ -1200,7 +1204,7 @@ next: goto err_free; } - if (btrtl_dev->ic_info->cfg_name) { + if (btrtl_dev->ic_info->cfg_name && !btrtl_dev->key_id) { if (postfix) { snprintf(cfg_name, sizeof(cfg_name), "%s-%s.bin", btrtl_dev->ic_info->cfg_name, postfix); diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 5e9ebf0c5312..fa683bb7f0b4 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2711,9 +2711,21 @@ static int btusb_recv_event_realtek(struct hci_dev *hdev, struct sk_buff *skb) static void btusb_mtk_claim_iso_intf(struct btusb_data *data) { - struct btmtk_data *btmtk_data = hci_get_priv(data->hdev); + struct btmtk_data *btmtk_data; int err; + if (!data->hdev) + return; + + btmtk_data = hci_get_priv(data->hdev); + if (!btmtk_data) + return; + + if (!btmtk_data->isopkt_intf) { + bt_dev_err(data->hdev, "Can't claim NULL iso interface"); + return; + } + /* * The function usb_driver_claim_interface() is documented to need * locks held if it's not called from a probe routine. The code here @@ -2735,17 +2747,30 @@ static void btusb_mtk_claim_iso_intf(struct btusb_data *data) static void btusb_mtk_release_iso_intf(struct hci_dev *hdev) { - struct btmtk_data *btmtk_data = hci_get_priv(hdev); + struct btmtk_data *btmtk_data; + + if (!hdev) + return; + + btmtk_data = hci_get_priv(hdev); + if (!btmtk_data) + return; if (test_bit(BTMTK_ISOPKT_OVER_INTR, &btmtk_data->flags)) { usb_kill_anchored_urbs(&btmtk_data->isopkt_anchor); clear_bit(BTMTK_ISOPKT_RUNNING, &btmtk_data->flags); - dev_kfree_skb_irq(btmtk_data->isopkt_skb); - btmtk_data->isopkt_skb = NULL; - usb_set_intfdata(btmtk_data->isopkt_intf, NULL); - usb_driver_release_interface(&btusb_driver, - btmtk_data->isopkt_intf); + if (btmtk_data->isopkt_skb) { + dev_kfree_skb_irq(btmtk_data->isopkt_skb); + btmtk_data->isopkt_skb = NULL; + } + + if (btmtk_data->isopkt_intf) { + usb_set_intfdata(btmtk_data->isopkt_intf, NULL); + usb_driver_release_interface(&btusb_driver, + btmtk_data->isopkt_intf); + btmtk_data->isopkt_intf = NULL; + } } clear_bit(BTMTK_ISOPKT_OVER_INTR, &btmtk_data->flags); @@ -4361,6 +4386,11 @@ static void btusb_disconnect(struct usb_interface *intf) hci_unregister_dev(hdev); + if (data->oob_wake_irq) + device_init_wakeup(&data->udev->dev, false); + if (data->reset_gpio) + gpiod_put(data->reset_gpio); + if (intf == data->intf) { if (data->isoc) usb_driver_release_interface(&btusb_driver, data->isoc); @@ -4371,17 +4401,11 @@ static void btusb_disconnect(struct usb_interface *intf) usb_driver_release_interface(&btusb_driver, data->diag); usb_driver_release_interface(&btusb_driver, data->intf); } else if (intf == data->diag) { - usb_driver_release_interface(&btusb_driver, data->intf); if (data->isoc) usb_driver_release_interface(&btusb_driver, data->isoc); + usb_driver_release_interface(&btusb_driver, data->intf); } - if (data->oob_wake_irq) - device_init_wakeup(&data->udev->dev, false); - - if (data->reset_gpio) - gpiod_put(data->reset_gpio); - hci_free_dev(hdev); } diff --git a/drivers/bus/fsl-mc/mc-sys.c b/drivers/bus/fsl-mc/mc-sys.c index b22c59d57c8f..31037f41893e 100644 --- a/drivers/bus/fsl-mc/mc-sys.c +++ b/drivers/bus/fsl-mc/mc-sys.c @@ -248,7 +248,7 @@ int mc_send_command(struct fsl_mc_io *mc_io, struct fsl_mc_command *cmd) enum mc_cmd_status status; unsigned long irq_flags = 0; - if (in_irq() && !(mc_io->flags & FSL_MC_IO_ATOMIC_CONTEXT_PORTAL)) + if (in_hardirq() && !(mc_io->flags & FSL_MC_IO_ATOMIC_CONTEXT_PORTAL)) return -EINVAL; if (mc_io->flags & FSL_MC_IO_ATOMIC_CONTEXT_PORTAL) diff --git a/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c b/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c index 70ce0ca0cb7d..0339c4af0fe5 100644 --- a/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c +++ b/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c @@ -121,11 +121,11 @@ static SUNXI_CCU_GATE_HW(bus_r_ir_rx_clk, "bus-r-ir-rx", &r_apb0_clk.common.hw, 0x1cc, BIT(0), 0); static SUNXI_CCU_GATE_HW(bus_r_dma_clk, "bus-r-dma", - &r_apb0_clk.common.hw, 0x1dc, BIT(0), 0); + &r_apb0_clk.common.hw, 0x1dc, BIT(0), CLK_IS_CRITICAL); static SUNXI_CCU_GATE_HW(bus_r_rtc_clk, "bus-r-rtc", &r_apb0_clk.common.hw, 0x20c, BIT(0), 0); static SUNXI_CCU_GATE_HW(bus_r_cpucfg_clk, "bus-r-cpucfg", - &r_apb0_clk.common.hw, 0x22c, BIT(0), 0); + &r_apb0_clk.common.hw, 0x22c, BIT(0), CLK_IS_CRITICAL); static struct ccu_common *sun55i_a523_r_ccu_clks[] = { &r_ahb_clk.common, diff --git a/drivers/clk/sunxi-ng/ccu-sun55i-a523.c b/drivers/clk/sunxi-ng/ccu-sun55i-a523.c index acb532f8361b..20dad06b37ca 100644 --- a/drivers/clk/sunxi-ng/ccu-sun55i-a523.c +++ b/drivers/clk/sunxi-ng/ccu-sun55i-a523.c @@ -300,7 +300,7 @@ static struct ccu_nm pll_audio0_4x_clk = { .m = _SUNXI_CCU_DIV(16, 6), .sdm = _SUNXI_CCU_SDM(pll_audio0_sdm_table, BIT(24), 0x178, BIT(31)), - .min_rate = 180000000U, + .min_rate = 90000000U, .max_rate = 3000000000U, .common = { .reg = 0x078, diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index ffcd23668763..aa59e5b13351 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -782,4 +782,15 @@ config NXP_STM_TIMER Enables the support for NXP System Timer Module found in the s32g NXP platform series. +config RTK_SYSTIMER + bool "Realtek SYSTIMER support" + depends on ARM || ARM64 + depends on ARCH_REALTEK || COMPILE_TEST + select TIMER_OF + help + This option enables the driver that registers the global 1 MHz hardware + counter as a clock event device on Realtek SoCs. Make sure to enable + this option only when building for a Realtek platform or for compilation + testing. + endmenu diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index ec4452ee958f..b46376af6b49 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -95,3 +95,4 @@ obj-$(CONFIG_CLKSRC_LOONGSON1_PWM) += timer-loongson1-pwm.o obj-$(CONFIG_EP93XX_TIMER) += timer-ep93xx.o obj-$(CONFIG_RALINK_TIMER) += timer-ralink.o obj-$(CONFIG_NXP_STM_TIMER) += timer-nxp-stm.o +obj-$(CONFIG_RTK_SYSTIMER) += timer-realtek.o diff --git a/drivers/clocksource/arm_arch_timer_mmio.c b/drivers/clocksource/arm_arch_timer_mmio.c index ebe1987d651e..d10362692fdd 100644 --- a/drivers/clocksource/arm_arch_timer_mmio.c +++ b/drivers/clocksource/arm_arch_timer_mmio.c @@ -426,6 +426,7 @@ static struct platform_driver arch_timer_mmio_drv = { .driver = { .name = "arch-timer-mmio", .of_match_table = arch_timer_mmio_of_table, + .suppress_bind_attrs = true, }, .probe = arch_timer_mmio_probe, }; @@ -434,6 +435,7 @@ builtin_platform_driver(arch_timer_mmio_drv); static struct platform_driver arch_timer_mmio_acpi_drv = { .driver = { .name = "gtdt-arm-mmio-timer", + .suppress_bind_attrs = true, }, .probe = arch_timer_mmio_probe, }; diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 385eb94bbe7c..791b298c995b 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -355,14 +355,6 @@ static int sh_cmt_enable(struct sh_cmt_channel *ch) dev_pm_syscore_device(&ch->cmt->pdev->dev, true); - /* enable clock */ - ret = clk_enable(ch->cmt->clk); - if (ret) { - dev_err(&ch->cmt->pdev->dev, "ch%u: cannot enable clock\n", - ch->index); - goto err0; - } - /* make sure channel is disabled */ sh_cmt_start_stop_ch(ch, 0); @@ -384,19 +376,12 @@ static int sh_cmt_enable(struct sh_cmt_channel *ch) if (ret || sh_cmt_read_cmcnt(ch)) { dev_err(&ch->cmt->pdev->dev, "ch%u: cannot clear CMCNT\n", ch->index); - ret = -ETIMEDOUT; - goto err1; + return -ETIMEDOUT; } /* enable channel */ sh_cmt_start_stop_ch(ch, 1); return 0; - err1: - /* stop clock */ - clk_disable(ch->cmt->clk); - - err0: - return ret; } static void sh_cmt_disable(struct sh_cmt_channel *ch) @@ -407,9 +392,6 @@ static void sh_cmt_disable(struct sh_cmt_channel *ch) /* disable interrupts in CMT block */ sh_cmt_write_cmcsr(ch, 0); - /* stop clock */ - clk_disable(ch->cmt->clk); - dev_pm_syscore_device(&ch->cmt->pdev->dev, false); } @@ -583,8 +565,6 @@ static int sh_cmt_start_clocksource(struct sh_cmt_channel *ch) int ret = 0; unsigned long flags; - pm_runtime_get_sync(&ch->cmt->pdev->dev); - raw_spin_lock_irqsave(&ch->lock, flags); if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) @@ -619,8 +599,6 @@ static void sh_cmt_stop_clocksource(struct sh_cmt_channel *ch) sh_cmt_disable(ch); raw_spin_unlock_irqrestore(&ch->lock, flags); - - pm_runtime_put(&ch->cmt->pdev->dev); } static int sh_cmt_start_clockevent(struct sh_cmt_channel *ch) @@ -630,10 +608,8 @@ static int sh_cmt_start_clockevent(struct sh_cmt_channel *ch) raw_spin_lock_irqsave(&ch->lock, flags); - if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { - pm_runtime_get_sync(&ch->cmt->pdev->dev); + if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) ret = sh_cmt_enable(ch); - } if (ret) goto out; @@ -656,10 +632,8 @@ static void sh_cmt_stop_clockevent(struct sh_cmt_channel *ch) ch->flags &= ~FLAG_CLOCKEVENT; - if (f && !(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) { + if (f && !(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE))) sh_cmt_disable(ch); - pm_runtime_put(&ch->cmt->pdev->dev); - } /* adjust the timeout to maximum if only clocksource left */ if (ch->flags & FLAG_CLOCKSOURCE) @@ -1134,8 +1108,6 @@ static int sh_cmt_setup(struct sh_cmt_device *cmt, struct platform_device *pdev) mask &= ~(1 << hwidx); } - clk_disable(cmt->clk); - platform_set_drvdata(pdev, cmt); return 0; @@ -1183,8 +1155,6 @@ static int sh_cmt_probe(struct platform_device *pdev) out: if (cmt->has_clockevent || cmt->has_clocksource) pm_runtime_irq_safe(&pdev->dev); - else - pm_runtime_idle(&pdev->dev); return 0; } diff --git a/drivers/clocksource/timer-nxp-pit.c b/drivers/clocksource/timer-nxp-pit.c index 2d0a3554b6bf..d1740f18f718 100644 --- a/drivers/clocksource/timer-nxp-pit.c +++ b/drivers/clocksource/timer-nxp-pit.c @@ -374,9 +374,10 @@ static struct platform_driver nxp_pit_driver = { .driver = { .name = "nxp-pit", .of_match_table = pit_timer_of_match, + .suppress_bind_attrs = true, }, .probe = pit_timer_probe, }; -module_platform_driver(nxp_pit_driver); +builtin_platform_driver(nxp_pit_driver); TIMER_OF_DECLARE(vf610, "fsl,vf610-pit", pit_timer_init); diff --git a/drivers/clocksource/timer-nxp-stm.c b/drivers/clocksource/timer-nxp-stm.c index bbc40623728f..1ab907233f48 100644 --- a/drivers/clocksource/timer-nxp-stm.c +++ b/drivers/clocksource/timer-nxp-stm.c @@ -177,15 +177,15 @@ static void nxp_stm_clocksource_resume(struct clocksource *cs) nxp_stm_clocksource_enable(cs); } -static void __init devm_clocksource_unregister(void *data) +static void devm_clocksource_unregister(void *data) { struct stm_timer *stm_timer = data; clocksource_unregister(&stm_timer->cs); } -static int __init nxp_stm_clocksource_init(struct device *dev, struct stm_timer *stm_timer, - const char *name, void __iomem *base, struct clk *clk) +static int nxp_stm_clocksource_init(struct device *dev, struct stm_timer *stm_timer, + const char *name, void __iomem *base, struct clk *clk) { int ret; @@ -208,10 +208,8 @@ static int __init nxp_stm_clocksource_init(struct device *dev, struct stm_timer return ret; ret = devm_add_action_or_reset(dev, devm_clocksource_unregister, stm_timer); - if (ret) { - clocksource_unregister(&stm_timer->cs); + if (ret) return ret; - } stm_sched_clock = stm_timer; @@ -298,9 +296,9 @@ static void nxp_stm_clockevent_resume(struct clock_event_device *ced) nxp_stm_module_get(stm_timer); } -static int __init nxp_stm_clockevent_per_cpu_init(struct device *dev, struct stm_timer *stm_timer, - const char *name, void __iomem *base, int irq, - struct clk *clk, int cpu) +static int nxp_stm_clockevent_per_cpu_init(struct device *dev, struct stm_timer *stm_timer, + const char *name, void __iomem *base, int irq, + struct clk *clk, int cpu) { stm_timer->base = base; stm_timer->rate = clk_get_rate(clk); @@ -388,7 +386,7 @@ static irqreturn_t nxp_stm_module_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static int __init nxp_stm_timer_probe(struct platform_device *pdev) +static int nxp_stm_timer_probe(struct platform_device *pdev) { struct stm_timer *stm_timer; struct device *dev = &pdev->dev; @@ -484,14 +482,15 @@ static const struct of_device_id nxp_stm_of_match[] = { }; MODULE_DEVICE_TABLE(of, nxp_stm_of_match); -static struct platform_driver nxp_stm_probe = { +static struct platform_driver nxp_stm_driver = { .probe = nxp_stm_timer_probe, .driver = { .name = "nxp-stm", .of_match_table = nxp_stm_of_match, + .suppress_bind_attrs = true, }, }; -module_platform_driver(nxp_stm_probe); +builtin_platform_driver(nxp_stm_driver); MODULE_DESCRIPTION("NXP System Timer Module driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/clocksource/timer-ralink.c b/drivers/clocksource/timer-ralink.c index 6ecdb4228f76..68434d9ed910 100644 --- a/drivers/clocksource/timer-ralink.c +++ b/drivers/clocksource/timer-ralink.c @@ -130,14 +130,15 @@ static int __init ralink_systick_init(struct device_node *np) systick.dev.irq = irq_of_parse_and_map(np, 0); if (!systick.dev.irq) { pr_err("%pOFn: request_irq failed", np); - return -EINVAL; + ret = -EINVAL; + goto err_iounmap; } ret = clocksource_mmio_init(systick.membase + SYSTICK_COUNT, np->name, SYSTICK_FREQ, 301, 16, clocksource_mmio_readl_up); if (ret) - return ret; + goto err_free_irq; clockevents_register_device(&systick.dev); @@ -145,6 +146,12 @@ static int __init ralink_systick_init(struct device_node *np) np, systick.dev.mult, systick.dev.shift); return 0; + +err_free_irq: + irq_dispose_mapping(systick.dev.irq); +err_iounmap: + iounmap(systick.membase); + return ret; } TIMER_OF_DECLARE(systick, "ralink,cevt-systick", ralink_systick_init); diff --git a/drivers/clocksource/timer-rda.c b/drivers/clocksource/timer-rda.c index fd1199c189bf..0be8e05970e2 100644 --- a/drivers/clocksource/timer-rda.c +++ b/drivers/clocksource/timer-rda.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/interrupt.h> +#include <linux/sched_clock.h> #include "timer-of.h" @@ -153,7 +154,7 @@ static struct timer_of rda_ostimer_of = { }, }; -static u64 rda_hwtimer_read(struct clocksource *cs) +static u64 rda_hwtimer_clocksource_read(void) { void __iomem *base = timer_of_base(&rda_ostimer_of); u32 lo, hi; @@ -167,6 +168,11 @@ static u64 rda_hwtimer_read(struct clocksource *cs) return ((u64)hi << 32) | lo; } +static u64 rda_hwtimer_read(struct clocksource *cs) +{ + return rda_hwtimer_clocksource_read(); +} + static struct clocksource rda_hwtimer_clocksource = { .name = "rda-timer", .rating = 400, @@ -185,6 +191,7 @@ static int __init rda_timer_init(struct device_node *np) return ret; clocksource_register_hz(&rda_hwtimer_clocksource, rate); + sched_clock_register(rda_hwtimer_clocksource_read, 64, rate); clockevents_config_and_register(&rda_ostimer_of.clkevt, rate, 0x2, UINT_MAX); diff --git a/drivers/clocksource/timer-realtek.c b/drivers/clocksource/timer-realtek.c new file mode 100644 index 000000000000..4f0439de9939 --- /dev/null +++ b/drivers/clocksource/timer-realtek.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025 Realtek Semiconductor Corp. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/irqflags.h> +#include <linux/interrupt.h> +#include "timer-of.h" + +#define ENBL 1 +#define DSBL 0 + +#define SYSTIMER_RATE 1000000 +#define SYSTIMER_MIN_DELTA 0x64 +#define SYSTIMER_MAX_DELTA ULONG_MAX + +/* SYSTIMER Register Offset (RTK Internal Use) */ +#define TS_LW_OFST 0x0 +#define TS_HW_OFST 0x4 +#define TS_CMP_VAL_LW_OFST 0x8 +#define TS_CMP_VAL_HW_OFST 0xC +#define TS_CMP_CTRL_OFST 0x10 +#define TS_CMP_STAT_OFST 0x14 + +/* SYSTIMER CMP CTRL REG Mask */ +#define TS_CMP_EN_MASK 0x1 +#define TS_WR_EN0_MASK 0x2 + +static void __iomem *systimer_base; + +static u64 rtk_ts64_read(void) +{ + u32 low, high; + u64 ts; + + /* Caution: Read LSB word (TS_LW_OFST) first then MSB (TS_HW_OFST) */ + low = readl(systimer_base + TS_LW_OFST); + high = readl(systimer_base + TS_HW_OFST); + ts = ((u64)high << 32) | low; + + return ts; +} + +static void rtk_cmp_value_write(u64 value) +{ + u32 high, low; + + low = value & 0xFFFFFFFF; + high = value >> 32; + + writel(high, systimer_base + TS_CMP_VAL_HW_OFST); + writel(low, systimer_base + TS_CMP_VAL_LW_OFST); +} + +static inline void rtk_cmp_en_write(bool cmp_en) +{ + u32 val; + + val = TS_WR_EN0_MASK; + if (cmp_en == ENBL) + val |= TS_CMP_EN_MASK; + + writel(val, systimer_base + TS_CMP_CTRL_OFST); +} + +static int rtk_syst_clkevt_next_event(unsigned long cycles, struct clock_event_device *clkevt) +{ + u64 cmp_val; + + rtk_cmp_en_write(DSBL); + cmp_val = rtk_ts64_read(); + + /* Set CMP value to current timestamp plus delta_us */ + rtk_cmp_value_write(cmp_val + cycles); + rtk_cmp_en_write(ENBL); + return 0; +} + +static irqreturn_t rtk_ts_match_intr_handler(int irq, void *dev_id) +{ + struct clock_event_device *clkevt = dev_id; + void __iomem *reg_base; + u32 val; + + /* Disable TS CMP Match */ + rtk_cmp_en_write(DSBL); + + /* Clear TS CMP INTR */ + reg_base = systimer_base + TS_CMP_STAT_OFST; + val = readl(reg_base) & TS_CMP_EN_MASK; + writel(val | TS_CMP_EN_MASK, reg_base); + clkevt->event_handler(clkevt); + + return IRQ_HANDLED; +} + +static int rtk_syst_shutdown(struct clock_event_device *clkevt) +{ + void __iomem *reg_base; + u64 cmp_val = 0; + + /* Disable TS CMP Match */ + rtk_cmp_en_write(DSBL); + /* Set compare value to 0 */ + rtk_cmp_value_write(cmp_val); + + /* Clear TS CMP INTR */ + reg_base = systimer_base + TS_CMP_STAT_OFST; + writel(TS_CMP_EN_MASK, reg_base); + return 0; +} + +static struct timer_of rtk_timer_to = { + .flags = TIMER_OF_IRQ | TIMER_OF_BASE, + + .clkevt = { + .name = "rtk-clkevt", + .rating = 300, + .cpumask = cpu_possible_mask, + .features = CLOCK_EVT_FEAT_DYNIRQ | + CLOCK_EVT_FEAT_ONESHOT, + .set_next_event = rtk_syst_clkevt_next_event, + .set_state_oneshot = rtk_syst_shutdown, + .set_state_shutdown = rtk_syst_shutdown, + }, + + .of_irq = { + .flags = IRQF_TIMER | IRQF_IRQPOLL, + .handler = rtk_ts_match_intr_handler, + }, +}; + +static int __init rtk_systimer_init(struct device_node *node) +{ + int ret; + + ret = timer_of_init(node, &rtk_timer_to); + if (ret) + return ret; + + systimer_base = timer_of_base(&rtk_timer_to); + clockevents_config_and_register(&rtk_timer_to.clkevt, SYSTIMER_RATE, + SYSTIMER_MIN_DELTA, SYSTIMER_MAX_DELTA); + + return 0; +} + +TIMER_OF_DECLARE(rtk_systimer, "realtek,rtd1625-systimer", rtk_systimer_init); diff --git a/drivers/clocksource/timer-sp804.c b/drivers/clocksource/timer-sp804.c index cd1916c05325..e82a95ea4724 100644 --- a/drivers/clocksource/timer-sp804.c +++ b/drivers/clocksource/timer-sp804.c @@ -21,6 +21,10 @@ #include <linux/of_irq.h> #include <linux/sched_clock.h> +#ifdef CONFIG_ARM +#include <linux/delay.h> +#endif + #include "timer-sp.h" /* Hisilicon 64-bit timer(a variant of ARM SP804) */ @@ -102,6 +106,23 @@ static u64 notrace sp804_read(void) return ~readl_relaxed(sched_clkevt->value); } +#ifdef CONFIG_ARM +static struct delay_timer delay; +static unsigned long sp804_read_delay_timer_read(void) +{ + return sp804_read(); +} + +static void sp804_register_delay_timer(int freq) +{ + delay.freq = freq; + delay.read_current_timer = sp804_read_delay_timer_read; + register_current_timer_delay(&delay); +} +#else +static inline void sp804_register_delay_timer(int freq) {} +#endif + static int __init sp804_clocksource_and_sched_clock_init(void __iomem *base, const char *name, struct clk *clk, @@ -114,6 +135,8 @@ static int __init sp804_clocksource_and_sched_clock_init(void __iomem *base, if (rate < 0) return -EINVAL; + sp804_register_delay_timer(rate); + clkevt = sp804_clkevt_get(base); writel(0, clkevt->ctrl); @@ -318,6 +341,7 @@ static int __init sp804_of_init(struct device_node *np, struct sp804_timer *time if (ret) goto err; } + initialized = true; return 0; diff --git a/drivers/clocksource/timer-sprd.c b/drivers/clocksource/timer-sprd.c index 430cb99d8d79..2c07dd2af760 100644 --- a/drivers/clocksource/timer-sprd.c +++ b/drivers/clocksource/timer-sprd.c @@ -30,6 +30,7 @@ #define TIMER_VALUE_SHDW_HI 0x1c #define TIMER_VALUE_LO_MASK GENMASK(31, 0) +#define TIMER_VALUE_HI_MASK GENMASK(31, 0) static void sprd_timer_enable(void __iomem *base, u32 flag) { @@ -162,15 +163,26 @@ static struct timer_of suspend_to = { static u64 sprd_suspend_timer_read(struct clocksource *cs) { - return ~(u64)readl_relaxed(timer_of_base(&suspend_to) + - TIMER_VALUE_SHDW_LO) & cs->mask; + u32 lo, hi; + + do { + hi = readl_relaxed(timer_of_base(&suspend_to) + + TIMER_VALUE_SHDW_HI); + lo = readl_relaxed(timer_of_base(&suspend_to) + + TIMER_VALUE_SHDW_LO); + } while (hi != readl_relaxed(timer_of_base(&suspend_to) + TIMER_VALUE_SHDW_HI)); + + return ~(((u64)hi << 32) | lo); } static int sprd_suspend_timer_enable(struct clocksource *cs) { - sprd_timer_update_counter(timer_of_base(&suspend_to), - TIMER_VALUE_LO_MASK); - sprd_timer_enable(timer_of_base(&suspend_to), TIMER_CTL_PERIOD_MODE); + writel_relaxed(TIMER_VALUE_LO_MASK, + timer_of_base(&suspend_to) + TIMER_LOAD_LO); + writel_relaxed(TIMER_VALUE_HI_MASK, + timer_of_base(&suspend_to) + TIMER_LOAD_HI); + sprd_timer_enable(timer_of_base(&suspend_to), + TIMER_CTL_PERIOD_MODE|TIMER_CTL_64BIT_WIDTH); return 0; } @@ -186,7 +198,7 @@ static struct clocksource suspend_clocksource = { .read = sprd_suspend_timer_read, .enable = sprd_suspend_timer_enable, .disable = sprd_suspend_timer_disable, - .mask = CLOCKSOURCE_MASK(32), + .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_SUSPEND_NONSTOP, }; diff --git a/drivers/clocksource/timer-stm32-lp.c b/drivers/clocksource/timer-stm32-lp.c index c2a699f5c1dd..3d804128c765 100644 --- a/drivers/clocksource/timer-stm32-lp.c +++ b/drivers/clocksource/timer-stm32-lp.c @@ -289,5 +289,4 @@ static struct platform_driver stm32_clkevent_lp_driver = { }; module_platform_driver(stm32_clkevent_lp_driver); -MODULE_ALIAS("platform:stm32-lptimer-timer"); MODULE_DESCRIPTION("STMicroelectronics STM32 clockevent low power driver"); diff --git a/drivers/counter/microchip-tcb-capture.c b/drivers/counter/microchip-tcb-capture.c index 1a299d1f350b..19d457ae4c3b 100644 --- a/drivers/counter/microchip-tcb-capture.c +++ b/drivers/counter/microchip-tcb-capture.c @@ -451,7 +451,7 @@ static void mchp_tc_irq_remove(void *ptr) static int mchp_tc_irq_enable(struct counter_device *const counter, int irq) { struct mchp_tc_data *const priv = counter_priv(counter); - int ret = devm_request_irq(counter->parent, irq, mchp_tc_isr, 0, + int ret = devm_request_irq(counter->parent, irq, mchp_tc_isr, IRQF_SHARED, dev_name(counter->parent), counter); if (ret < 0) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 38897bb14a2c..492a10f1bdbf 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -603,9 +603,6 @@ static bool turbo_is_disabled(void) { u64 misc_en; - if (!cpu_feature_enabled(X86_FEATURE_IDA)) - return true; - rdmsrq(MSR_IA32_MISC_ENABLE, misc_en); return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); @@ -2106,7 +2103,8 @@ static u64 atom_get_val(struct cpudata *cpudata, int pstate) u32 vid; val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled)) + if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && + cpu_feature_enabled(X86_FEATURE_IDA)) val |= (u64)1 << 32; vid_fp = cpudata->vid.min + mul_fp( @@ -2271,7 +2269,8 @@ static u64 core_get_val(struct cpudata *cpudata, int pstate) u64 val; val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled)) + if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && + cpu_feature_enabled(X86_FEATURE_IDA)) val |= (u64)1 << 32; return val; diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c index a360bc4d20b7..19be6475d356 100644 --- a/drivers/cpuidle/cpuidle-riscv-sbi.c +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/of.h> #include <linux/slab.h> +#include <linux/string.h> #include <linux/platform_device.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> @@ -303,8 +304,8 @@ static int sbi_cpuidle_init_cpu(struct device *dev, int cpu) drv->states[0].exit_latency = 1; drv->states[0].target_residency = 1; drv->states[0].power_usage = UINT_MAX; - strcpy(drv->states[0].name, "WFI"); - strcpy(drv->states[0].desc, "RISC-V WFI"); + strscpy(drv->states[0].name, "WFI"); + strscpy(drv->states[0].desc, "RISC-V WFI"); /* * If no DT idle states are detected (ret == 0) let the driver diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 0d13d47c164b..b28a6f50daaa 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -259,27 +259,20 @@ static int sev_cmd_buffer_len(int cmd) static struct file *open_file_as_root(const char *filename, int flags, umode_t mode) { - struct file *fp; - struct path root; - struct cred *cred; - const struct cred *old_cred; + struct path root __free(path_put) = {}; task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); - cred = prepare_creds(); + CLASS(prepare_creds, cred)(); if (!cred) return ERR_PTR(-ENOMEM); - cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds(cred); - fp = file_open_root(&root, filename, flags, mode); - path_put(&root); - - put_cred(revert_creds(old_cred)); + cred->fsuid = GLOBAL_ROOT_UID; - return fp; + scoped_with_creds(cred) + return file_open_root(&root, filename, flags, mode); } static int sev_read_init_ex_file(void) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index a5b96adf2d1e..3b391a146635 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3871,10 +3871,12 @@ static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf, pdev = container_of(dev, struct pci_dev, dev); if (pci_physfn(pdev) != qm->pdev) { pci_err(qm->pdev, "the pdev input does not match the pf!\n"); + put_device(dev); return -EINVAL; } *fun_index = pdev->devfn; + put_device(dev); return 0; } diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index b06fee1978ba..41b64d871c5a 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3702,6 +3702,7 @@ static int cxl_region_debugfs_poison_inject(void *data, u64 offset) if (validate_region_offset(cxlr, offset)) return -EINVAL; + offset -= cxlr->params.cache_size; rc = region_offset_to_dpa_result(cxlr, offset, &result); if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) { dev_dbg(&cxlr->dev, @@ -3734,6 +3735,7 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset) if (validate_region_offset(cxlr, offset)) return -EINVAL; + offset -= cxlr->params.cache_size; rc = region_offset_to_dpa_result(cxlr, offset, &result); if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) { dev_dbg(&cxlr->dev, diff --git a/drivers/dax/super.c b/drivers/dax/super.c index d7714d8afb0f..c00b9dff4a06 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -433,7 +433,7 @@ static struct dax_device *dax_dev_get(dev_t devt) return NULL; dax_dev = to_dax_dev(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { set_bit(DAXDEV_ALIVE, &dax_dev->flags); inode->i_cdev = &dax_dev->cdev; inode->i_mode = S_IFCHR; diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 2bcf9ceca997..edaa9e4ee4ae 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -768,18 +768,10 @@ EXPORT_SYMBOL_NS_GPL(dma_buf_export, "DMA_BUF"); */ int dma_buf_fd(struct dma_buf *dmabuf, int flags) { - int fd; - if (!dmabuf || !dmabuf->file) return -EINVAL; - fd = get_unused_fd_flags(flags); - if (fd < 0) - return fd; - - fd_install(fd, dmabuf->file); - - return fd; + return FD_ADD(flags, dmabuf->file); } EXPORT_SYMBOL_NS_GPL(dma_buf_fd, "DMA_BUF"); diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 39352b9b7a7e..81e40543ffd8 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -23,14 +23,6 @@ menuconfig EDAC if EDAC -config EDAC_LEGACY_SYSFS - bool "EDAC legacy sysfs" - default y - help - Enable the compatibility sysfs nodes. - Use 'Y' if your edac utilities aren't ported to work with the newer - structures. - config EDAC_DEBUG bool "Debugging" select DEBUG_FS @@ -291,6 +283,18 @@ config EDAC_I10NM system has non-volatile DIMMs you should also manually select CONFIG_ACPI_NFIT. +config EDAC_IMH + tristate "Intel Integrated Memory/IO Hub MC" + depends on X86_64 && X86_MCE_INTEL && ACPI + depends on ACPI_NFIT || !ACPI_NFIT # if ACPI_NFIT=m, EDAC_IMH can't be y + select DMI + select ACPI_ADXL + help + Support for error detection and correction the Intel + Integrated Memory/IO Hub Memory Controller. This MC IP is + first used on the Diamond Rapids servers but may appear on + others in the future. + config EDAC_PND2 tristate "Intel Pondicherry2" depends on PCI && X86_64 && X86_MCE_INTEL diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 1c14796410a3..8429b1e856bc 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -65,6 +65,9 @@ obj-$(CONFIG_EDAC_SKX) += skx_edac.o skx_edac_common.o i10nm_edac-y := i10nm_base.o obj-$(CONFIG_EDAC_I10NM) += i10nm_edac.o skx_edac_common.o +imh_edac-y := imh_base.o +obj-$(CONFIG_EDAC_IMH) += imh_edac.o skx_edac_common.o + obj-$(CONFIG_EDAC_HIGHBANK_MC) += highbank_mc_edac.o obj-$(CONFIG_EDAC_HIGHBANK_L2) += highbank_l2_edac.o diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 103b2c2eba2a..0c5b94e64ea1 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -1184,10 +1184,22 @@ altr_check_ocram_deps_init(struct altr_edac_device_dev *device) if (ret) return ret; - /* Verify OCRAM has been initialized */ + /* + * Verify that OCRAM has been initialized. + * During a warm reset, OCRAM contents are retained, but the control + * and status registers are reset to their default values. Therefore, + * ECC must be explicitly re-enabled in the control register. + * Error condition: if INITCOMPLETEA is clear and ECC_EN is already set. + */ if (!ecc_test_bits(ALTR_A10_ECC_INITCOMPLETEA, - (base + ALTR_A10_ECC_INITSTAT_OFST))) - return -ENODEV; + (base + ALTR_A10_ECC_INITSTAT_OFST))) { + if (!ecc_test_bits(ALTR_A10_ECC_EN, + (base + ALTR_A10_ECC_CTRL_OFST))) + ecc_set_bits(ALTR_A10_ECC_EN, + (base + ALTR_A10_ECC_CTRL_OFST)); + else + return -ENODEV; + } /* Enable IRQ on Single Bit Error */ writel(ALTR_A10_ECC_SERRINTEN, (base + ALTR_A10_ECC_ERRINTENS_OFST)); @@ -1357,7 +1369,7 @@ static const struct edac_device_prv_data a10_enetecc_data = { .ue_set_mask = ALTR_A10_ECC_TDERRA, .set_err_ofst = ALTR_A10_ECC_INTTEST_OFST, .ecc_irq_handler = altr_edac_a10_ecc_irq, - .inject_fops = &altr_edac_a10_device_inject2_fops, + .inject_fops = &altr_edac_a10_device_inject_fops, }; #endif /* CONFIG_EDAC_ALTERA_ETHERNET */ @@ -1447,7 +1459,7 @@ static const struct edac_device_prv_data a10_usbecc_data = { .ue_set_mask = ALTR_A10_ECC_TDERRA, .set_err_ofst = ALTR_A10_ECC_INTTEST_OFST, .ecc_irq_handler = altr_edac_a10_ecc_irq, - .inject_fops = &altr_edac_a10_device_inject2_fops, + .inject_fops = &altr_edac_a10_device_inject_fops, }; #endif /* CONFIG_EDAC_ALTERA_USB */ diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 2f6ab783bf20..2391f3469961 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3732,6 +3732,7 @@ static void hw_info_put(struct amd64_pvt *pvt) pci_dev_put(pvt->F1); pci_dev_put(pvt->F2); kfree(pvt->umc); + kfree(pvt->csels); } static struct low_ops umc_ops = { @@ -3766,6 +3767,7 @@ static int per_family_init(struct amd64_pvt *pvt) pvt->stepping = boot_cpu_data.x86_stepping; pvt->model = boot_cpu_data.x86_model; pvt->fam = boot_cpu_data.x86; + char *tmp_name = NULL; pvt->max_mcs = 2; /* @@ -3779,7 +3781,7 @@ static int per_family_init(struct amd64_pvt *pvt) switch (pvt->fam) { case 0xf: - pvt->ctl_name = (pvt->ext_model >= K8_REV_F) ? + tmp_name = (pvt->ext_model >= K8_REV_F) ? "K8 revF or later" : "K8 revE or earlier"; pvt->f1_id = PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP; pvt->f2_id = PCI_DEVICE_ID_AMD_K8_NB_MEMCTL; @@ -3788,7 +3790,6 @@ static int per_family_init(struct amd64_pvt *pvt) break; case 0x10: - pvt->ctl_name = "F10h"; pvt->f1_id = PCI_DEVICE_ID_AMD_10H_NB_MAP; pvt->f2_id = PCI_DEVICE_ID_AMD_10H_NB_DRAM; pvt->ops->dbam_to_cs = f10_dbam_to_chip_select; @@ -3797,12 +3798,10 @@ static int per_family_init(struct amd64_pvt *pvt) case 0x15: switch (pvt->model) { case 0x30: - pvt->ctl_name = "F15h_M30h"; pvt->f1_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F1; pvt->f2_id = PCI_DEVICE_ID_AMD_15H_M30H_NB_F2; break; case 0x60: - pvt->ctl_name = "F15h_M60h"; pvt->f1_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F1; pvt->f2_id = PCI_DEVICE_ID_AMD_15H_M60H_NB_F2; pvt->ops->dbam_to_cs = f15_m60h_dbam_to_chip_select; @@ -3811,7 +3810,6 @@ static int per_family_init(struct amd64_pvt *pvt) /* Richland is only client */ return -ENODEV; default: - pvt->ctl_name = "F15h"; pvt->f1_id = PCI_DEVICE_ID_AMD_15H_NB_F1; pvt->f2_id = PCI_DEVICE_ID_AMD_15H_NB_F2; pvt->ops->dbam_to_cs = f15_dbam_to_chip_select; @@ -3822,12 +3820,10 @@ static int per_family_init(struct amd64_pvt *pvt) case 0x16: switch (pvt->model) { case 0x30: - pvt->ctl_name = "F16h_M30h"; pvt->f1_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F1; pvt->f2_id = PCI_DEVICE_ID_AMD_16H_M30H_NB_F2; break; default: - pvt->ctl_name = "F16h"; pvt->f1_id = PCI_DEVICE_ID_AMD_16H_NB_F1; pvt->f2_id = PCI_DEVICE_ID_AMD_16H_NB_F2; break; @@ -3836,76 +3832,51 @@ static int per_family_init(struct amd64_pvt *pvt) case 0x17: switch (pvt->model) { - case 0x10 ... 0x2f: - pvt->ctl_name = "F17h_M10h"; - break; case 0x30 ... 0x3f: - pvt->ctl_name = "F17h_M30h"; pvt->max_mcs = 8; break; - case 0x60 ... 0x6f: - pvt->ctl_name = "F17h_M60h"; - break; - case 0x70 ... 0x7f: - pvt->ctl_name = "F17h_M70h"; - break; default: - pvt->ctl_name = "F17h"; break; } break; case 0x18: - pvt->ctl_name = "F18h"; break; case 0x19: switch (pvt->model) { case 0x00 ... 0x0f: - pvt->ctl_name = "F19h"; pvt->max_mcs = 8; break; case 0x10 ... 0x1f: - pvt->ctl_name = "F19h_M10h"; pvt->max_mcs = 12; pvt->flags.zn_regs_v2 = 1; break; - case 0x20 ... 0x2f: - pvt->ctl_name = "F19h_M20h"; - break; case 0x30 ... 0x3f: if (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) { - pvt->ctl_name = "MI200"; + tmp_name = "MI200"; pvt->max_mcs = 4; pvt->dram_type = MEM_HBM2; pvt->gpu_umc_base = 0x50000; pvt->ops = &gpu_ops; } else { - pvt->ctl_name = "F19h_M30h"; pvt->max_mcs = 8; } break; - case 0x50 ... 0x5f: - pvt->ctl_name = "F19h_M50h"; - break; case 0x60 ... 0x6f: - pvt->ctl_name = "F19h_M60h"; pvt->flags.zn_regs_v2 = 1; break; case 0x70 ... 0x7f: - pvt->ctl_name = "F19h_M70h"; pvt->max_mcs = 4; pvt->flags.zn_regs_v2 = 1; break; case 0x90 ... 0x9f: - pvt->ctl_name = "F19h_M90h"; pvt->max_mcs = 4; pvt->dram_type = MEM_HBM3; pvt->gpu_umc_base = 0x90000; pvt->ops = &gpu_ops; break; case 0xa0 ... 0xaf: - pvt->ctl_name = "F19h_MA0h"; pvt->max_mcs = 12; pvt->flags.zn_regs_v2 = 1; break; @@ -3915,34 +3886,22 @@ static int per_family_init(struct amd64_pvt *pvt) case 0x1A: switch (pvt->model) { case 0x00 ... 0x1f: - pvt->ctl_name = "F1Ah"; pvt->max_mcs = 12; pvt->flags.zn_regs_v2 = 1; break; case 0x40 ... 0x4f: - pvt->ctl_name = "F1Ah_M40h"; pvt->flags.zn_regs_v2 = 1; break; case 0x50 ... 0x57: - pvt->ctl_name = "F1Ah_M50h"; + case 0xc0 ... 0xc7: pvt->max_mcs = 16; pvt->flags.zn_regs_v2 = 1; break; case 0x90 ... 0x9f: - pvt->ctl_name = "F1Ah_M90h"; - pvt->max_mcs = 8; - pvt->flags.zn_regs_v2 = 1; - break; case 0xa0 ... 0xaf: - pvt->ctl_name = "F1Ah_MA0h"; pvt->max_mcs = 8; pvt->flags.zn_regs_v2 = 1; break; - case 0xc0 ... 0xc7: - pvt->ctl_name = "F1Ah_MC0h"; - pvt->max_mcs = 16; - pvt->flags.zn_regs_v2 = 1; - break; } break; @@ -3951,6 +3910,16 @@ static int per_family_init(struct amd64_pvt *pvt) return -ENODEV; } + if (tmp_name) + scnprintf(pvt->ctl_name, sizeof(pvt->ctl_name), tmp_name); + else + scnprintf(pvt->ctl_name, sizeof(pvt->ctl_name), "F%02Xh_M%02Xh", + pvt->fam, pvt->model); + + pvt->csels = kcalloc(pvt->max_mcs, sizeof(*pvt->csels), GFP_KERNEL); + if (!pvt->csels) + return -ENOMEM; + return 0; } diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index d70b8a8d0b09..1757c1b99fc8 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -96,11 +96,12 @@ /* Hardware limit on ChipSelect rows per MC and processors per system */ #define NUM_CHIPSELECTS 8 #define DRAM_RANGES 8 -#define NUM_CONTROLLERS 16 #define ON true #define OFF false +#define MAX_CTL_NAMELEN 19 + /* * PCI-defined configuration space registers */ @@ -346,7 +347,7 @@ struct amd64_pvt { u32 dbam1; /* DRAM Base Address Mapping reg for DCT1 */ /* one for each DCT/UMC */ - struct chip_select csels[NUM_CONTROLLERS]; + struct chip_select *csels; /* DRAM base and limit pairs F1x[78,70,68,60,58,50,48,40] */ struct dram_range ranges[DRAM_RANGES]; @@ -362,7 +363,7 @@ struct amd64_pvt { /* x4, x8, or x16 syndromes in use */ u8 ecc_sym_sz; - const char *ctl_name; + char ctl_name[MAX_CTL_NAMELEN]; u16 f1_id, f2_id; /* Maximum number of memory controllers per die/node. */ u8 max_mcs; diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 8689631f1905..091cc6aae8a9 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -115,401 +115,6 @@ static const char * const edac_caps[] = { [EDAC_S16ECD16ED] = "S16ECD16ED" }; -#ifdef CONFIG_EDAC_LEGACY_SYSFS -/* - * EDAC sysfs CSROW data structures and methods - */ - -#define to_csrow(k) container_of(k, struct csrow_info, dev) - -/* - * We need it to avoid namespace conflicts between the legacy API - * and the per-dimm/per-rank one - */ -#define DEVICE_ATTR_LEGACY(_name, _mode, _show, _store) \ - static struct device_attribute dev_attr_legacy_##_name = __ATTR(_name, _mode, _show, _store) - -struct dev_ch_attribute { - struct device_attribute attr; - unsigned int channel; -}; - -#define DEVICE_CHANNEL(_name, _mode, _show, _store, _var) \ - static struct dev_ch_attribute dev_attr_legacy_##_name = \ - { __ATTR(_name, _mode, _show, _store), (_var) } - -#define to_channel(k) (container_of(k, struct dev_ch_attribute, attr)->channel) - -/* Set of more default csrow<id> attribute show/store functions */ -static ssize_t csrow_ue_count_show(struct device *dev, - struct device_attribute *mattr, char *data) -{ - struct csrow_info *csrow = to_csrow(dev); - - return sysfs_emit(data, "%u\n", csrow->ue_count); -} - -static ssize_t csrow_ce_count_show(struct device *dev, - struct device_attribute *mattr, char *data) -{ - struct csrow_info *csrow = to_csrow(dev); - - return sysfs_emit(data, "%u\n", csrow->ce_count); -} - -static ssize_t csrow_size_show(struct device *dev, - struct device_attribute *mattr, char *data) -{ - struct csrow_info *csrow = to_csrow(dev); - int i; - u32 nr_pages = 0; - - for (i = 0; i < csrow->nr_channels; i++) - nr_pages += csrow->channels[i]->dimm->nr_pages; - return sysfs_emit(data, "%u\n", PAGES_TO_MiB(nr_pages)); -} - -static ssize_t csrow_mem_type_show(struct device *dev, - struct device_attribute *mattr, char *data) -{ - struct csrow_info *csrow = to_csrow(dev); - - return sysfs_emit(data, "%s\n", edac_mem_types[csrow->channels[0]->dimm->mtype]); -} - -static ssize_t csrow_dev_type_show(struct device *dev, - struct device_attribute *mattr, char *data) -{ - struct csrow_info *csrow = to_csrow(dev); - - return sysfs_emit(data, "%s\n", dev_types[csrow->channels[0]->dimm->dtype]); -} - -static ssize_t csrow_edac_mode_show(struct device *dev, - struct device_attribute *mattr, - char *data) -{ - struct csrow_info *csrow = to_csrow(dev); - - return sysfs_emit(data, "%s\n", edac_caps[csrow->channels[0]->dimm->edac_mode]); -} - -/* show/store functions for DIMM Label attributes */ -static ssize_t channel_dimm_label_show(struct device *dev, - struct device_attribute *mattr, - char *data) -{ - struct csrow_info *csrow = to_csrow(dev); - unsigned int chan = to_channel(mattr); - struct rank_info *rank = csrow->channels[chan]; - - /* if field has not been initialized, there is nothing to send */ - if (!rank->dimm->label[0]) - return 0; - - return sysfs_emit(data, "%s\n", rank->dimm->label); -} - -static ssize_t channel_dimm_label_store(struct device *dev, - struct device_attribute *mattr, - const char *data, size_t count) -{ - struct csrow_info *csrow = to_csrow(dev); - unsigned int chan = to_channel(mattr); - struct rank_info *rank = csrow->channels[chan]; - size_t copy_count = count; - - if (count == 0) - return -EINVAL; - - if (data[count - 1] == '\0' || data[count - 1] == '\n') - copy_count -= 1; - - if (copy_count == 0 || copy_count >= sizeof(rank->dimm->label)) - return -EINVAL; - - memcpy(rank->dimm->label, data, copy_count); - rank->dimm->label[copy_count] = '\0'; - - return count; -} - -/* show function for dynamic chX_ce_count attribute */ -static ssize_t channel_ce_count_show(struct device *dev, - struct device_attribute *mattr, char *data) -{ - struct csrow_info *csrow = to_csrow(dev); - unsigned int chan = to_channel(mattr); - struct rank_info *rank = csrow->channels[chan]; - - return sysfs_emit(data, "%u\n", rank->ce_count); -} - -/* cwrow<id>/attribute files */ -DEVICE_ATTR_LEGACY(size_mb, S_IRUGO, csrow_size_show, NULL); -DEVICE_ATTR_LEGACY(dev_type, S_IRUGO, csrow_dev_type_show, NULL); -DEVICE_ATTR_LEGACY(mem_type, S_IRUGO, csrow_mem_type_show, NULL); -DEVICE_ATTR_LEGACY(edac_mode, S_IRUGO, csrow_edac_mode_show, NULL); -DEVICE_ATTR_LEGACY(ue_count, S_IRUGO, csrow_ue_count_show, NULL); -DEVICE_ATTR_LEGACY(ce_count, S_IRUGO, csrow_ce_count_show, NULL); - -/* default attributes of the CSROW<id> object */ -static struct attribute *csrow_attrs[] = { - &dev_attr_legacy_dev_type.attr, - &dev_attr_legacy_mem_type.attr, - &dev_attr_legacy_edac_mode.attr, - &dev_attr_legacy_size_mb.attr, - &dev_attr_legacy_ue_count.attr, - &dev_attr_legacy_ce_count.attr, - NULL, -}; - -static const struct attribute_group csrow_attr_grp = { - .attrs = csrow_attrs, -}; - -static const struct attribute_group *csrow_attr_groups[] = { - &csrow_attr_grp, - NULL -}; - -static const struct device_type csrow_attr_type = { - .groups = csrow_attr_groups, -}; - -/* - * possible dynamic channel DIMM Label attribute files - * - */ -DEVICE_CHANNEL(ch0_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 0); -DEVICE_CHANNEL(ch1_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 1); -DEVICE_CHANNEL(ch2_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 2); -DEVICE_CHANNEL(ch3_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 3); -DEVICE_CHANNEL(ch4_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 4); -DEVICE_CHANNEL(ch5_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 5); -DEVICE_CHANNEL(ch6_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 6); -DEVICE_CHANNEL(ch7_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 7); -DEVICE_CHANNEL(ch8_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 8); -DEVICE_CHANNEL(ch9_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 9); -DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 10); -DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 11); -DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 12); -DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 13); -DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 14); -DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR, - channel_dimm_label_show, channel_dimm_label_store, 15); - -/* Total possible dynamic DIMM Label attribute file table */ -static struct attribute *dynamic_csrow_dimm_attr[] = { - &dev_attr_legacy_ch0_dimm_label.attr.attr, - &dev_attr_legacy_ch1_dimm_label.attr.attr, - &dev_attr_legacy_ch2_dimm_label.attr.attr, - &dev_attr_legacy_ch3_dimm_label.attr.attr, - &dev_attr_legacy_ch4_dimm_label.attr.attr, - &dev_attr_legacy_ch5_dimm_label.attr.attr, - &dev_attr_legacy_ch6_dimm_label.attr.attr, - &dev_attr_legacy_ch7_dimm_label.attr.attr, - &dev_attr_legacy_ch8_dimm_label.attr.attr, - &dev_attr_legacy_ch9_dimm_label.attr.attr, - &dev_attr_legacy_ch10_dimm_label.attr.attr, - &dev_attr_legacy_ch11_dimm_label.attr.attr, - &dev_attr_legacy_ch12_dimm_label.attr.attr, - &dev_attr_legacy_ch13_dimm_label.attr.attr, - &dev_attr_legacy_ch14_dimm_label.attr.attr, - &dev_attr_legacy_ch15_dimm_label.attr.attr, - NULL -}; - -/* possible dynamic channel ce_count attribute files */ -DEVICE_CHANNEL(ch0_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 0); -DEVICE_CHANNEL(ch1_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 1); -DEVICE_CHANNEL(ch2_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 2); -DEVICE_CHANNEL(ch3_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 3); -DEVICE_CHANNEL(ch4_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 4); -DEVICE_CHANNEL(ch5_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 5); -DEVICE_CHANNEL(ch6_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 6); -DEVICE_CHANNEL(ch7_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 7); -DEVICE_CHANNEL(ch8_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 8); -DEVICE_CHANNEL(ch9_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 9); -DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 10); -DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 11); -DEVICE_CHANNEL(ch12_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 12); -DEVICE_CHANNEL(ch13_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 13); -DEVICE_CHANNEL(ch14_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 14); -DEVICE_CHANNEL(ch15_ce_count, S_IRUGO, - channel_ce_count_show, NULL, 15); - -/* Total possible dynamic ce_count attribute file table */ -static struct attribute *dynamic_csrow_ce_count_attr[] = { - &dev_attr_legacy_ch0_ce_count.attr.attr, - &dev_attr_legacy_ch1_ce_count.attr.attr, - &dev_attr_legacy_ch2_ce_count.attr.attr, - &dev_attr_legacy_ch3_ce_count.attr.attr, - &dev_attr_legacy_ch4_ce_count.attr.attr, - &dev_attr_legacy_ch5_ce_count.attr.attr, - &dev_attr_legacy_ch6_ce_count.attr.attr, - &dev_attr_legacy_ch7_ce_count.attr.attr, - &dev_attr_legacy_ch8_ce_count.attr.attr, - &dev_attr_legacy_ch9_ce_count.attr.attr, - &dev_attr_legacy_ch10_ce_count.attr.attr, - &dev_attr_legacy_ch11_ce_count.attr.attr, - &dev_attr_legacy_ch12_ce_count.attr.attr, - &dev_attr_legacy_ch13_ce_count.attr.attr, - &dev_attr_legacy_ch14_ce_count.attr.attr, - &dev_attr_legacy_ch15_ce_count.attr.attr, - NULL -}; - -static umode_t csrow_dev_is_visible(struct kobject *kobj, - struct attribute *attr, int idx) -{ - struct device *dev = kobj_to_dev(kobj); - struct csrow_info *csrow = container_of(dev, struct csrow_info, dev); - - if (idx >= csrow->nr_channels) - return 0; - - if (idx >= ARRAY_SIZE(dynamic_csrow_ce_count_attr) - 1) { - WARN_ONCE(1, "idx: %d\n", idx); - return 0; - } - - /* Only expose populated DIMMs */ - if (!csrow->channels[idx]->dimm->nr_pages) - return 0; - - return attr->mode; -} - - -static const struct attribute_group csrow_dev_dimm_group = { - .attrs = dynamic_csrow_dimm_attr, - .is_visible = csrow_dev_is_visible, -}; - -static const struct attribute_group csrow_dev_ce_count_group = { - .attrs = dynamic_csrow_ce_count_attr, - .is_visible = csrow_dev_is_visible, -}; - -static const struct attribute_group *csrow_dev_groups[] = { - &csrow_dev_dimm_group, - &csrow_dev_ce_count_group, - NULL -}; - -static void csrow_release(struct device *dev) -{ - /* - * Nothing to do, just unregister sysfs here. The mci - * device owns the data and will also release it. - */ -} - -static inline int nr_pages_per_csrow(struct csrow_info *csrow) -{ - int chan, nr_pages = 0; - - for (chan = 0; chan < csrow->nr_channels; chan++) - nr_pages += csrow->channels[chan]->dimm->nr_pages; - - return nr_pages; -} - -/* Create a CSROW object under specified edac_mc_device */ -static int edac_create_csrow_object(struct mem_ctl_info *mci, - struct csrow_info *csrow, int index) -{ - int err; - - csrow->dev.type = &csrow_attr_type; - csrow->dev.groups = csrow_dev_groups; - csrow->dev.release = csrow_release; - device_initialize(&csrow->dev); - csrow->dev.parent = &mci->dev; - csrow->mci = mci; - dev_set_name(&csrow->dev, "csrow%d", index); - dev_set_drvdata(&csrow->dev, csrow); - - err = device_add(&csrow->dev); - if (err) { - edac_dbg(1, "failure: create device %s\n", dev_name(&csrow->dev)); - put_device(&csrow->dev); - return err; - } - - edac_dbg(0, "device %s created\n", dev_name(&csrow->dev)); - - return 0; -} - -/* Create a CSROW object under specified edac_mc_device */ -static int edac_create_csrow_objects(struct mem_ctl_info *mci) -{ - int err, i; - struct csrow_info *csrow; - - for (i = 0; i < mci->nr_csrows; i++) { - csrow = mci->csrows[i]; - if (!nr_pages_per_csrow(csrow)) - continue; - err = edac_create_csrow_object(mci, mci->csrows[i], i); - if (err < 0) - goto error; - } - return 0; - -error: - for (--i; i >= 0; i--) { - if (device_is_registered(&mci->csrows[i]->dev)) - device_unregister(&mci->csrows[i]->dev); - } - - return err; -} - -static void edac_delete_csrow_objects(struct mem_ctl_info *mci) -{ - int i; - - for (i = 0; i < mci->nr_csrows; i++) { - if (device_is_registered(&mci->csrows[i]->dev)) - device_unregister(&mci->csrows[i]->dev); - } -} - -#endif - /* * Per-dimm (or per-rank) devices */ @@ -989,12 +594,6 @@ int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, goto fail; } -#ifdef CONFIG_EDAC_LEGACY_SYSFS - err = edac_create_csrow_objects(mci); - if (err < 0) - goto fail; -#endif - edac_create_debugfs_nodes(mci); return 0; @@ -1019,9 +618,6 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) #ifdef CONFIG_EDAC_DEBUG edac_debugfs_remove_recursive(mci->debugfs); #endif -#ifdef CONFIG_EDAC_LEGACY_SYSFS - edac_delete_csrow_objects(mci); -#endif mci_for_each_dimm(mci, dimm) { if (!device_is_registered(&dimm->dev)) diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 1eb0136c6fbd..d80c88818691 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -15,6 +15,7 @@ #include "edac_module.h" #include <ras/ras_event.h> #include <linux/notifier.h> +#include <linux/string.h> #define OTHER_DETAIL_LEN 400 @@ -332,7 +333,7 @@ static int ghes_edac_report_mem_error(struct notifier_block *nb, p = pvt->msg; p += snprintf(p, sizeof(pvt->msg), "%s", cper_mem_err_type_str(etype)); } else { - strcpy(pvt->msg, "unknown error"); + strscpy(pvt->msg, "unknown error"); } /* Error address */ @@ -357,14 +358,14 @@ static int ghes_edac_report_mem_error(struct notifier_block *nb, dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle); if (dimm) { e->top_layer = dimm->idx; - strcpy(e->label, dimm->label); + strscpy(e->label, dimm->label); } } if (p > e->location) *(p - 1) = '\0'; if (!*e->label) - strcpy(e->label, "unknown memory"); + strscpy(e->label, "unknown memory"); /* All other fields are mapped on e->other_detail */ p = pvt->other_detail; diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 2010a47149f4..89b3e8cc38b1 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -1198,7 +1198,8 @@ static int __init i10nm_init(void) d->imc[i].num_dimms = cfg->ddr_dimm_num; } - rc = skx_register_mci(&d->imc[i], d->imc[i].mdev, + rc = skx_register_mci(&d->imc[i], &d->imc[i].mdev->dev, + pci_name(d->imc[i].mdev), "Intel_10nm Socket", EDAC_MOD_STR, i10nm_get_dimm_config, cfg); if (rc < 0) diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c index 5a080ab65476..8d4ddaa85ae8 100644 --- a/drivers/edac/ie31200_edac.c +++ b/drivers/edac/ie31200_edac.c @@ -526,6 +526,7 @@ static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, in ie31200_pvt.priv[mc] = priv; return 0; fail_unmap: + put_device(&priv->dev); iounmap(window); fail_free: edac_mc_free(mci); @@ -598,6 +599,7 @@ static void ie31200_unregister_mcis(void) mci = priv->mci; edac_mc_del_mc(mci->pdev); iounmap(priv->window); + put_device(&priv->dev); edac_mc_free(mci); } } diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index 2fc59f9eed69..553c31a2d922 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -1300,6 +1300,7 @@ static int igen6_register_mci(int mc, void __iomem *window, struct pci_dev *pdev imc->mci = mci; return 0; fail3: + put_device(&imc->dev); mci->pvt_info = NULL; kfree(mci->ctl_name); fail2: @@ -1326,6 +1327,7 @@ static void igen6_unregister_mcis(void) kfree(mci->ctl_name); mci->pvt_info = NULL; edac_mc_free(mci); + put_device(&imc->dev); iounmap(imc->window); } } diff --git a/drivers/edac/imh_base.c b/drivers/edac/imh_base.c new file mode 100644 index 000000000000..4348b3883b45 --- /dev/null +++ b/drivers/edac/imh_base.c @@ -0,0 +1,602 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for Intel(R) servers with Integrated Memory/IO Hub-based memory controller. + * Copyright (c) 2025, Intel Corporation. + */ + +#include <linux/kernel.h> +#include <linux/io.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> +#include <asm/mce.h> +#include <asm/cpu.h> +#include "edac_module.h" +#include "skx_common.h" + +#define IMH_REVISION "v0.0.1" +#define EDAC_MOD_STR "imh_edac" + +/* Debug macros */ +#define imh_printk(level, fmt, arg...) \ + edac_printk(level, "imh", fmt, ##arg) + +/* Configuration Agent(Ubox) */ +#define MMIO_BASE_H(reg) (((u64)GET_BITFIELD(reg, 0, 29)) << 23) +#define SOCKET_ID(reg) GET_BITFIELD(reg, 0, 3) + +/* PUNIT */ +#define DDR_IMC_BITMAP(reg) GET_BITFIELD(reg, 23, 30) + +/* Memory Controller */ +#define ECC_ENABLED(reg) GET_BITFIELD(reg, 2, 2) +#define DIMM_POPULATED(reg) GET_BITFIELD(reg, 15, 15) + +/* System Cache Agent(SCA) */ +#define TOLM(reg) (((u64)GET_BITFIELD(reg, 16, 31)) << 16) +#define TOHM(reg) (((u64)GET_BITFIELD(reg, 16, 51)) << 16) + +/* Home Agent (HA) */ +#define NMCACHING(reg) GET_BITFIELD(reg, 8, 8) + +/** + * struct local_reg - A register as described in the local package view. + * + * @pkg: (input) The package where the register is located. + * @pbase: (input) The IP MMIO base physical address in the local package view. + * @size: (input) The IP MMIO size. + * @offset: (input) The register offset from the IP MMIO base @pbase. + * @width: (input) The register width in byte. + * @vbase: (internal) The IP MMIO base virtual address. + * @val: (output) The register value. + */ +struct local_reg { + int pkg; + u64 pbase; + u32 size; + u32 offset; + u8 width; + void __iomem *vbase; + u64 val; +}; + +#define DEFINE_LOCAL_REG(name, cfg, package, north, ip_name, ip_idx, reg_name) \ + struct local_reg name = { \ + .pkg = package, \ + .pbase = (north ? (cfg)->mmio_base_l_north : \ + (cfg)->mmio_base_l_south) + \ + (cfg)->ip_name##_base + \ + (cfg)->ip_name##_size * (ip_idx), \ + .size = (cfg)->ip_name##_size, \ + .offset = (cfg)->ip_name##_reg_##reg_name##_offset, \ + .width = (cfg)->ip_name##_reg_##reg_name##_width, \ + } + +static u64 readx(void __iomem *addr, u8 width) +{ + switch (width) { + case 1: + return readb(addr); + case 2: + return readw(addr); + case 4: + return readl(addr); + case 8: + return readq(addr); + default: + imh_printk(KERN_ERR, "Invalid reg 0x%p width %d\n", addr, width); + return 0; + } +} + +static void __read_local_reg(void *reg) +{ + struct local_reg *r = (struct local_reg *)reg; + + r->val = readx(r->vbase + r->offset, r->width); +} + +/* Read a local-view register. */ +static bool read_local_reg(struct local_reg *reg) +{ + int cpu; + + /* Get the target CPU in the package @reg->pkg. */ + for_each_online_cpu(cpu) { + if (reg->pkg == topology_physical_package_id(cpu)) + break; + } + + if (cpu >= nr_cpu_ids) + return false; + + reg->vbase = ioremap(reg->pbase, reg->size); + if (!reg->vbase) { + imh_printk(KERN_ERR, "Failed to ioremap 0x%llx\n", reg->pbase); + return false; + } + + /* Get the target CPU to read the register. */ + smp_call_function_single(cpu, __read_local_reg, reg, 1); + iounmap(reg->vbase); + + return true; +} + +/* Get the bitmap of memory controller instances in package @pkg. */ +static u32 get_imc_bitmap(struct res_config *cfg, int pkg, bool north) +{ + DEFINE_LOCAL_REG(reg, cfg, pkg, north, pcu, 0, capid3); + + if (!read_local_reg(®)) + return 0; + + edac_dbg(2, "Pkg%d %s mc instances bitmap 0x%llx (reg 0x%llx)\n", + pkg, north ? "north" : "south", + DDR_IMC_BITMAP(reg.val), reg.val); + + return DDR_IMC_BITMAP(reg.val); +} + +static void imc_release(struct device *dev) +{ + edac_dbg(2, "imc device %s released\n", dev_name(dev)); + kfree(dev); +} + +static int __get_ddr_munits(struct res_config *cfg, struct skx_dev *d, + bool north, int lmc) +{ + unsigned long size = cfg->ddr_chan_mmio_sz * cfg->ddr_chan_num; + unsigned long bitmap = get_imc_bitmap(cfg, d->pkg, north); + void __iomem *mbase; + struct device *dev; + int i, rc, pmc; + u64 base; + + for_each_set_bit(i, &bitmap, sizeof(bitmap) * 8) { + base = north ? d->mmio_base_h_north : d->mmio_base_h_south; + base += cfg->ddr_imc_base + size * i; + + edac_dbg(2, "Pkg%d mc%d mmio base 0x%llx size 0x%lx\n", + d->pkg, lmc, base, size); + + /* Set up the imc MMIO. */ + mbase = ioremap(base, size); + if (!mbase) { + imh_printk(KERN_ERR, "Failed to ioremap 0x%llx\n", base); + return -ENOMEM; + } + + d->imc[lmc].mbase = mbase; + d->imc[lmc].lmc = lmc; + + /* Create the imc device instance. */ + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->release = imc_release; + device_initialize(dev); + rc = dev_set_name(dev, "0x%llx", base); + if (rc) { + imh_printk(KERN_ERR, "Failed to set dev name\n"); + put_device(dev); + return rc; + } + + d->imc[lmc].dev = dev; + + /* Set up the imc index mapping. */ + pmc = north ? i : 8 + i; + skx_set_mc_mapping(d, pmc, lmc); + + lmc++; + } + + return lmc; +} + +static bool get_ddr_munits(struct res_config *cfg, struct skx_dev *d) +{ + int lmc = __get_ddr_munits(cfg, d, true, 0); + + if (lmc < 0) + return false; + + lmc = __get_ddr_munits(cfg, d, false, lmc); + if (lmc <= 0) + return false; + + return true; +} + +static bool get_socket_id(struct res_config *cfg, struct skx_dev *d) +{ + DEFINE_LOCAL_REG(reg, cfg, d->pkg, true, ubox, 0, socket_id); + u8 src_id; + int i; + + if (!read_local_reg(®)) + return false; + + src_id = SOCKET_ID(reg.val); + edac_dbg(2, "socket id 0x%x (reg 0x%llx)\n", src_id, reg.val); + + for (i = 0; i < cfg->ddr_imc_num; i++) + d->imc[i].src_id = src_id; + + return true; +} + +/* Get TOLM (Top Of Low Memory) and TOHM (Top Of High Memory) parameters. */ +static bool imh_get_tolm_tohm(struct res_config *cfg, u64 *tolm, u64 *tohm) +{ + DEFINE_LOCAL_REG(reg, cfg, 0, true, sca, 0, tolm); + + if (!read_local_reg(®)) + return false; + + *tolm = TOLM(reg.val); + edac_dbg(2, "tolm 0x%llx (reg 0x%llx)\n", *tolm, reg.val); + + DEFINE_LOCAL_REG(reg2, cfg, 0, true, sca, 0, tohm); + + if (!read_local_reg(®2)) + return false; + + *tohm = TOHM(reg2.val); + edac_dbg(2, "tohm 0x%llx (reg 0x%llx)\n", *tohm, reg2.val); + + return true; +} + +/* Get the system-view MMIO_BASE_H for {north,south}-IMH. */ +static int imh_get_all_mmio_base_h(struct res_config *cfg, struct list_head *edac_list) +{ + int i, n = topology_max_packages(), imc_num = cfg->ddr_imc_num + cfg->hbm_imc_num; + struct skx_dev *d; + + for (i = 0; i < n; i++) { + d = kzalloc(struct_size(d, imc, imc_num), GFP_KERNEL); + if (!d) + return -ENOMEM; + + DEFINE_LOCAL_REG(reg, cfg, i, true, ubox, 0, mmio_base); + + /* Get MMIO_BASE_H for the north-IMH. */ + if (!read_local_reg(®) || !reg.val) { + kfree(d); + imh_printk(KERN_ERR, "Pkg%d has no north mmio_base_h\n", i); + return -ENODEV; + } + + d->mmio_base_h_north = MMIO_BASE_H(reg.val); + edac_dbg(2, "Pkg%d north mmio_base_h 0x%llx (reg 0x%llx)\n", + i, d->mmio_base_h_north, reg.val); + + /* Get MMIO_BASE_H for the south-IMH (optional). */ + DEFINE_LOCAL_REG(reg2, cfg, i, false, ubox, 0, mmio_base); + + if (read_local_reg(®2)) { + d->mmio_base_h_south = MMIO_BASE_H(reg2.val); + edac_dbg(2, "Pkg%d south mmio_base_h 0x%llx (reg 0x%llx)\n", + i, d->mmio_base_h_south, reg2.val); + } + + d->pkg = i; + d->num_imc = imc_num; + skx_init_mc_mapping(d); + list_add_tail(&d->list, edac_list); + } + + return 0; +} + +/* Get the number of per-package memory controllers. */ +static int imh_get_imc_num(struct res_config *cfg) +{ + int imc_num = hweight32(get_imc_bitmap(cfg, 0, true)) + + hweight32(get_imc_bitmap(cfg, 0, false)); + + if (!imc_num) { + imh_printk(KERN_ERR, "Invalid mc number\n"); + return -ENODEV; + } + + if (cfg->ddr_imc_num != imc_num) { + /* + * Update the configuration data to reflect the number of + * present DDR memory controllers. + */ + cfg->ddr_imc_num = imc_num; + edac_dbg(2, "Set ddr mc number %d\n", imc_num); + } + + return 0; +} + +/* Get all memory controllers' parameters. */ +static int imh_get_munits(struct res_config *cfg, struct list_head *edac_list) +{ + struct skx_imc *imc; + struct skx_dev *d; + u8 mc = 0; + int i; + + list_for_each_entry(d, edac_list, list) { + if (!get_ddr_munits(cfg, d)) { + imh_printk(KERN_ERR, "No mc found\n"); + return -ENODEV; + } + + if (!get_socket_id(cfg, d)) { + imh_printk(KERN_ERR, "Failed to get socket id\n"); + return -ENODEV; + } + + for (i = 0; i < cfg->ddr_imc_num; i++) { + imc = &d->imc[i]; + if (!imc->mbase) + continue; + + imc->chan_mmio_sz = cfg->ddr_chan_mmio_sz; + imc->num_channels = cfg->ddr_chan_num; + imc->num_dimms = cfg->ddr_dimm_num; + imc->mc = mc++; + } + } + + return 0; +} + +static bool check_2lm_enabled(struct res_config *cfg, struct skx_dev *d, int ha_idx) +{ + DEFINE_LOCAL_REG(reg, cfg, d->pkg, true, ha, ha_idx, mode); + + if (!read_local_reg(®)) + return false; + + if (!NMCACHING(reg.val)) + return false; + + edac_dbg(2, "2-level memory configuration (reg 0x%llx, ha idx %d)\n", reg.val, ha_idx); + return true; +} + +/* Check whether the system has a 2-level memory configuration. */ +static bool imh_2lm_enabled(struct res_config *cfg, struct list_head *head) +{ + struct skx_dev *d; + int i; + + list_for_each_entry(d, head, list) { + for (i = 0; i < cfg->ddr_imc_num; i++) + if (check_2lm_enabled(cfg, d, i)) + return true; + } + + return false; +} + +/* Helpers to read memory controller registers */ +static u64 read_imc_reg(struct skx_imc *imc, int chan, u32 offset, u8 width) +{ + return readx(imc->mbase + imc->chan_mmio_sz * chan + offset, width); +} + +static u32 read_imc_mcmtr(struct res_config *cfg, struct skx_imc *imc, int chan) +{ + return (u32)read_imc_reg(imc, chan, cfg->ddr_reg_mcmtr_offset, cfg->ddr_reg_mcmtr_width); +} + +static u32 read_imc_dimmmtr(struct res_config *cfg, struct skx_imc *imc, int chan, int dimm) +{ + return (u32)read_imc_reg(imc, chan, cfg->ddr_reg_dimmmtr_offset + + cfg->ddr_reg_dimmmtr_width * dimm, + cfg->ddr_reg_dimmmtr_width); +} + +static bool ecc_enabled(u32 mcmtr) +{ + return (bool)ECC_ENABLED(mcmtr); +} + +static bool dimm_populated(u32 dimmmtr) +{ + return (bool)DIMM_POPULATED(dimmmtr); +} + +/* Get each DIMM's configurations of the memory controller @mci. */ +static int imh_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) +{ + struct skx_pvt *pvt = mci->pvt_info; + struct skx_imc *imc = pvt->imc; + struct dimm_info *dimm; + u32 mcmtr, dimmmtr; + int i, j, ndimms; + + for (i = 0; i < imc->num_channels; i++) { + if (!imc->mbase) + continue; + + mcmtr = read_imc_mcmtr(cfg, imc, i); + + for (ndimms = 0, j = 0; j < imc->num_dimms; j++) { + dimmmtr = read_imc_dimmmtr(cfg, imc, i, j); + edac_dbg(1, "mcmtr 0x%x dimmmtr 0x%x (mc%d ch%d dimm%d)\n", + mcmtr, dimmmtr, imc->mc, i, j); + + if (!dimm_populated(dimmmtr)) + continue; + + dimm = edac_get_dimm(mci, i, j, 0); + ndimms += skx_get_dimm_info(dimmmtr, 0, 0, dimm, + imc, i, j, cfg); + } + + if (ndimms && !ecc_enabled(mcmtr)) { + imh_printk(KERN_ERR, "ECC is disabled on mc%d ch%d\n", + imc->mc, i); + return -ENODEV; + } + } + + return 0; +} + +/* Register all memory controllers to the EDAC core. */ +static int imh_register_mci(struct res_config *cfg, struct list_head *edac_list) +{ + struct skx_imc *imc; + struct skx_dev *d; + int i, rc; + + list_for_each_entry(d, edac_list, list) { + for (i = 0; i < cfg->ddr_imc_num; i++) { + imc = &d->imc[i]; + if (!imc->mbase) + continue; + + rc = skx_register_mci(imc, imc->dev, + dev_name(imc->dev), + "Intel IMH-based Socket", + EDAC_MOD_STR, + imh_get_dimm_config, cfg); + if (rc) + return rc; + } + } + + return 0; +} + +static struct res_config dmr_cfg = { + .type = DMR, + .support_ddr5 = true, + .mmio_base_l_north = 0xf6800000, + .mmio_base_l_south = 0xf6000000, + .ddr_chan_num = 1, + .ddr_dimm_num = 2, + .ddr_imc_base = 0x39b000, + .ddr_chan_mmio_sz = 0x8000, + .ddr_reg_mcmtr_offset = 0x360, + .ddr_reg_mcmtr_width = 4, + .ddr_reg_dimmmtr_offset = 0x370, + .ddr_reg_dimmmtr_width = 4, + .ubox_base = 0x0, + .ubox_size = 0x2000, + .ubox_reg_mmio_base_offset = 0x580, + .ubox_reg_mmio_base_width = 4, + .ubox_reg_socket_id_offset = 0x1080, + .ubox_reg_socket_id_width = 4, + .pcu_base = 0x3000, + .pcu_size = 0x10000, + .pcu_reg_capid3_offset = 0x290, + .pcu_reg_capid3_width = 4, + .sca_base = 0x24c000, + .sca_size = 0x2500, + .sca_reg_tolm_offset = 0x2100, + .sca_reg_tolm_width = 8, + .sca_reg_tohm_offset = 0x2108, + .sca_reg_tohm_width = 8, + .ha_base = 0x3eb000, + .ha_size = 0x1000, + .ha_reg_mode_offset = 0x4a0, + .ha_reg_mode_width = 4, +}; + +static const struct x86_cpu_id imh_cpuids[] = { + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, &dmr_cfg), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, imh_cpuids); + +static struct notifier_block imh_mce_dec = { + .notifier_call = skx_mce_check_error, + .priority = MCE_PRIO_EDAC, +}; + +static int __init imh_init(void) +{ + const struct x86_cpu_id *id; + struct list_head *edac_list; + struct res_config *cfg; + const char *owner; + u64 tolm, tohm; + int rc; + + edac_dbg(2, "\n"); + + if (ghes_get_devices()) + return -EBUSY; + + owner = edac_get_owner(); + if (owner && strncmp(owner, EDAC_MOD_STR, sizeof(EDAC_MOD_STR))) + return -EBUSY; + + if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return -ENODEV; + + id = x86_match_cpu(imh_cpuids); + if (!id) + return -ENODEV; + cfg = (struct res_config *)id->driver_data; + skx_set_res_cfg(cfg); + + if (!imh_get_tolm_tohm(cfg, &tolm, &tohm)) + return -ENODEV; + + skx_set_hi_lo(tolm, tohm); + + rc = imh_get_imc_num(cfg); + if (rc < 0) + goto fail; + + edac_list = skx_get_edac_list(); + + rc = imh_get_all_mmio_base_h(cfg, edac_list); + if (rc) + goto fail; + + rc = imh_get_munits(cfg, edac_list); + if (rc) + goto fail; + + skx_set_mem_cfg(imh_2lm_enabled(cfg, edac_list)); + + rc = imh_register_mci(cfg, edac_list); + if (rc) + goto fail; + + rc = skx_adxl_get(); + if (rc) + goto fail; + + opstate_init(); + mce_register_decode_chain(&imh_mce_dec); + skx_setup_debug("imh_test"); + + imh_printk(KERN_INFO, "%s\n", IMH_REVISION); + + return 0; +fail: + skx_remove(); + return rc; +} + +static void __exit imh_exit(void) +{ + edac_dbg(2, "\n"); + + skx_teardown_debug(); + mce_unregister_decode_chain(&imh_mce_dec); + skx_adxl_put(); + skx_remove(); +} + +module_init(imh_init); +module_exit(imh_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Qiuxu Zhuo"); +MODULE_DESCRIPTION("MC Driver for Intel servers using IMH-based memory controller"); diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 078ddf95cc6e..aa6593ccda2d 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -662,8 +662,8 @@ static int __init skx_init(void) d->imc[i].src_id = src_id; d->imc[i].num_channels = cfg->ddr_chan_num; d->imc[i].num_dimms = cfg->ddr_dimm_num; - - rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, + rc = skx_register_mci(&d->imc[i], &d->imc[i].chan[0].cdev->dev, + pci_name(d->imc[i].chan[0].cdev), "Skylake Socket", EDAC_MOD_STR, skx_get_dimm_config, cfg); if (rc < 0) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index 724842f512ac..3276afe43922 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -124,7 +124,7 @@ void skx_adxl_put(void) } EXPORT_SYMBOL_GPL(skx_adxl_put); -static void skx_init_mc_mapping(struct skx_dev *d) +void skx_init_mc_mapping(struct skx_dev *d) { /* * By default, the BIOS presents all memory controllers within each @@ -135,6 +135,7 @@ static void skx_init_mc_mapping(struct skx_dev *d) for (int i = 0; i < d->num_imc; i++) d->imc[i].mc_mapping = i; } +EXPORT_SYMBOL_GPL(skx_init_mc_mapping); void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc) { @@ -384,6 +385,12 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) } EXPORT_SYMBOL_GPL(skx_get_all_bus_mappings); +struct list_head *skx_get_edac_list(void) +{ + return &dev_edac_list; +} +EXPORT_SYMBOL_GPL(skx_get_edac_list); + int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm) { struct pci_dev *pdev; @@ -424,6 +431,13 @@ fail: } EXPORT_SYMBOL_GPL(skx_get_hi_lo); +void skx_set_hi_lo(u64 tolm, u64 tohm) +{ + skx_tolm = tolm; + skx_tohm = tohm; +} +EXPORT_SYMBOL_GPL(skx_set_hi_lo); + static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, int minval, int maxval, const char *name) { @@ -437,7 +451,7 @@ static int skx_get_dimm_attr(u32 reg, int lobit, int hibit, int add, } #define numrank(reg) skx_get_dimm_attr(reg, 12, 13, 0, 0, 2, "ranks") -#define numrow(reg) skx_get_dimm_attr(reg, 2, 4, 12, 1, 6, "rows") +#define numrow(reg) skx_get_dimm_attr(reg, 2, 4, 12, 1, 7, "rows") #define numcol(reg) skx_get_dimm_attr(reg, 0, 1, 10, 0, 2, "cols") int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, @@ -545,9 +559,9 @@ unknown_size: } EXPORT_SYMBOL_GPL(skx_get_nvdimm_info); -int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, - const char *ctl_name, const char *mod_str, - get_dimm_config_f get_dimm_config, +int skx_register_mci(struct skx_imc *imc, struct device *dev, + const char *dev_name, const char *ctl_name, + const char *mod_str, get_dimm_config_f get_dimm_config, struct res_config *cfg) { struct mem_ctl_info *mci; @@ -588,7 +602,7 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = mod_str; - mci->dev_name = pci_name(pdev); + mci->dev_name = dev_name; mci->ctl_page_to_phys = NULL; rc = get_dimm_config(mci, cfg); @@ -596,7 +610,7 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, goto fail; /* Record ptr to the generic device */ - mci->pdev = &pdev->dev; + mci->pdev = dev; /* Add this new MC control structure to EDAC's list of MCs */ if (unlikely(edac_mc_add_mc(mci))) { @@ -810,6 +824,9 @@ void skx_remove(void) if (d->imc[i].mbase) iounmap(d->imc[i].mbase); + if (d->imc[i].dev) + put_device(d->imc[i].dev); + for (j = 0; j < d->imc[i].num_channels; j++) { if (d->imc[i].chan[j].cdev) pci_dev_put(d->imc[i].chan[j].cdev); @@ -833,7 +850,7 @@ EXPORT_SYMBOL_GPL(skx_remove); /* * Debug feature. * Exercise the address decode logic by writing an address to - * /sys/kernel/debug/edac/{skx,i10nm}_test/addr. + * /sys/kernel/debug/edac/{skx,i10nm,imh}_test/addr. */ static struct dentry *skx_test; diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index 73ba89786cdf..f88038e5b18c 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -121,20 +121,33 @@ struct reg_rrl { * memory controllers on the die. */ struct skx_dev { - struct list_head list; + /* {skx,i10nm}_edac */ u8 bus[4]; int seg; struct pci_dev *sad_all; struct pci_dev *util_all; - struct pci_dev *uracu; /* for i10nm CPU */ - struct pci_dev *pcu_cr3; /* for HBM memory detection */ + struct pci_dev *uracu; + struct pci_dev *pcu_cr3; u32 mcroute; + + /* imh_edac */ + /* System-view MMIO base physical addresses. */ + u64 mmio_base_h_north; + u64 mmio_base_h_south; + int pkg; + int num_imc; + struct list_head list; struct skx_imc { + /* i10nm_edac */ + struct pci_dev *mdev; + + /* imh_edac */ + struct device *dev; + struct mem_ctl_info *mci; - struct pci_dev *mdev; /* for i10nm CPU */ - void __iomem *mbase; /* for i10nm CPU */ - int chan_mmio_sz; /* for i10nm CPU */ + void __iomem *mbase; + int chan_mmio_sz; int num_channels; /* channels per memory controller */ int num_dimms; /* dimms per channel */ bool hbm_mc; @@ -178,7 +191,8 @@ enum type { SKX, I10NM, SPR, - GNR + GNR, + DMR, }; enum { @@ -237,10 +251,6 @@ struct pci_bdf { struct res_config { enum type type; - /* Configuration agent device ID */ - unsigned int decs_did; - /* Default bus number configuration register offset */ - int busno_cfg_offset; /* DDR memory controllers per socket */ int ddr_imc_num; /* DDR channels per DDR memory controller */ @@ -258,23 +268,57 @@ struct res_config { /* Per HBM channel memory-mapped I/O size */ int hbm_chan_mmio_sz; bool support_ddr5; - /* SAD device BDF */ - struct pci_bdf sad_all_bdf; - /* PCU device BDF */ - struct pci_bdf pcu_cr3_bdf; - /* UTIL device BDF */ - struct pci_bdf util_all_bdf; - /* URACU device BDF */ - struct pci_bdf uracu_bdf; - /* DDR mdev device BDF */ - struct pci_bdf ddr_mdev_bdf; - /* HBM mdev device BDF */ - struct pci_bdf hbm_mdev_bdf; - int sad_all_offset; /* RRL register sets per DDR channel */ struct reg_rrl *reg_rrl_ddr; /* RRL register sets per HBM channel */ struct reg_rrl *reg_rrl_hbm[2]; + union { + /* {skx,i10nm}_edac */ + struct { + /* Configuration agent device ID */ + unsigned int decs_did; + /* Default bus number configuration register offset */ + int busno_cfg_offset; + struct pci_bdf sad_all_bdf; + struct pci_bdf pcu_cr3_bdf; + struct pci_bdf util_all_bdf; + struct pci_bdf uracu_bdf; + struct pci_bdf ddr_mdev_bdf; + struct pci_bdf hbm_mdev_bdf; + int sad_all_offset; + }; + /* imh_edac */ + struct { + /* MMIO base physical address in local package view */ + u64 mmio_base_l_north; + u64 mmio_base_l_south; + u64 ddr_imc_base; + u64 ddr_reg_mcmtr_offset; + u8 ddr_reg_mcmtr_width; + u64 ddr_reg_dimmmtr_offset; + u8 ddr_reg_dimmmtr_width; + u64 ubox_base; + u32 ubox_size; + u32 ubox_reg_mmio_base_offset; + u8 ubox_reg_mmio_base_width; + u32 ubox_reg_socket_id_offset; + u8 ubox_reg_socket_id_width; + u64 pcu_base; + u32 pcu_size; + u32 pcu_reg_capid3_offset; + u8 pcu_reg_capid3_width; + u64 sca_base; + u32 sca_size; + u32 sca_reg_tolm_offset; + u8 sca_reg_tolm_width; + u32 sca_reg_tohm_offset; + u8 sca_reg_tohm_width; + u64 ha_base; + u32 ha_size; + u32 ha_reg_mode_offset; + u8 ha_reg_mode_width; + }; + }; }; typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci, @@ -287,13 +331,17 @@ void skx_adxl_put(void); void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log); void skx_set_mem_cfg(bool mem_cfg_2lm); void skx_set_res_cfg(struct res_config *cfg); +void skx_init_mc_mapping(struct skx_dev *d); void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc); int skx_get_src_id(struct skx_dev *d, int off, u8 *id); int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list); +struct list_head *skx_get_edac_list(void); + int skx_get_hi_lo(unsigned int did, int off[], u64 *tolm, u64 *tohm); +void skx_set_hi_lo(u64 tolm, u64 tohm); int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, struct skx_imc *imc, int chan, int dimmno, @@ -302,7 +350,7 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm, int skx_get_nvdimm_info(struct dimm_info *dimm, struct skx_imc *imc, int chan, int dimmno, const char *mod_str); -int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, +int skx_register_mci(struct skx_imc *imc, struct device *dev, const char *dev_name, const char *ctl_name, const char *mod_str, get_dimm_config_f get_dimm_config, struct res_config *cfg); diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c index 7c5db8bf0595..1a1092793092 100644 --- a/drivers/edac/versalnet_edac.c +++ b/drivers/edac/versalnet_edac.c @@ -433,7 +433,7 @@ static void handle_error(struct mc_priv *priv, struct ecc_status *stat, phys_addr_t pfn; int err; - if (WARN_ON_ONCE(ctl_num > NUM_CONTROLLERS)) + if (WARN_ON_ONCE(ctl_num >= NUM_CONTROLLERS)) return; mci = priv->mci[ctl_num]; @@ -605,21 +605,23 @@ static int rpmsg_cb(struct rpmsg_device *rpdev, void *data, length = result[MSG_ERR_LENGTH]; offset = result[MSG_ERR_OFFSET]; + /* + * The data can come in two stretches. Construct the regs from two + * messages. The offset indicates the offset from which the data is to + * be taken. + */ + for (i = 0 ; i < length; i++) { + k = offset + i; + j = ERROR_DATA + i; + mc_priv->regs[k] = result[j]; + } + if (result[TOTAL_ERR_LENGTH] > length) { if (!mc_priv->part_len) mc_priv->part_len = length; else mc_priv->part_len += length; - /* - * The data can come in 2 stretches. Construct the regs from 2 - * messages the offset indicates the offset from which the data is to - * be taken - */ - for (i = 0 ; i < length; i++) { - k = offset + i; - j = ERROR_DATA + i; - mc_priv->regs[k] = result[j]; - } + if (mc_priv->part_len < result[TOTAL_ERR_LENGTH]) return 0; mc_priv->part_len = 0; @@ -705,7 +707,7 @@ static int rpmsg_cb(struct rpmsg_device *rpdev, void *data, /* Convert to bytes */ length = result[TOTAL_ERR_LENGTH] * 4; log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message, - sec_sev, (void *)&result[ERROR_DATA], length); + sec_sev, (void *)&mc_priv->regs, length); return 0; } diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index e5e0174a0335..66e1106db5e7 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -577,6 +577,8 @@ void fw_card_initialize(struct fw_card *card, INIT_LIST_HEAD(&card->transactions.list); spin_lock_init(&card->transactions.lock); + spin_lock_init(&card->topology_map.lock); + card->split_timeout.hi = DEFAULT_SPLIT_TIMEOUT / 8000; card->split_timeout.lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; card->split_timeout.cycles = DEFAULT_SPLIT_TIMEOUT; diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 2f73bcd5696f..ed3ae8cdb0cd 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -441,12 +441,13 @@ static void update_topology_map(__be32 *buffer, size_t buffer_size, int root_nod const u32 *self_ids, int self_id_count) { __be32 *map = buffer; + u32 next_generation = be32_to_cpu(buffer[1]) + 1; int node_count = (root_node_id & 0x3f) + 1; memset(map, 0, buffer_size); *map++ = cpu_to_be32((self_id_count + 2) << 16); - *map++ = cpu_to_be32(be32_to_cpu(buffer[1]) + 1); + *map++ = cpu_to_be32(next_generation); *map++ = cpu_to_be32((node_count << 16) | self_id_count); while (self_id_count--) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 94b05e4451dd..7d15a85d579f 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -11,12 +11,12 @@ cflags-y := $(KBUILD_CFLAGS) cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small -cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ -std=gnu11 \ +cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ -std=gnu11 -fms-extensions \ -fPIC -fno-strict-aliasing -mno-red-zone \ -mno-mmx -mno-sse -fshort-wchar \ -Wno-pointer-sign \ $(call cc-disable-warning, address-of-packed-member) \ - $(call cc-disable-warning, gnu) \ + $(if $(CONFIG_CC_IS_CLANG),-Wno-gnu -Wno-microsoft-anon-tag) \ -fno-asynchronous-unwind-tables \ $(CLANG_FLAGS) diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c index f1c5fb45d5f7..c00d0ae7ed5d 100644 --- a/drivers/firmware/efi/libstub/x86-5lvl.c +++ b/drivers/firmware/efi/libstub/x86-5lvl.c @@ -66,7 +66,7 @@ void efi_5level_switch(void) bool have_la57 = native_read_cr4() & X86_CR4_LA57; bool need_toggle = want_la57 ^ have_la57; u64 *pgt = (void *)la57_toggle + PAGE_SIZE; - u64 *cr3 = (u64 *)__native_read_cr3(); + pgd_t *cr3 = (pgd_t *)native_read_cr3_pa(); u64 *new_cr3; if (!la57_toggle || !need_toggle) @@ -82,7 +82,7 @@ void efi_5level_switch(void) new_cr3[0] = (u64)cr3 | _PAGE_TABLE_NOENC; } else { /* take the new root table pointer from the current entry #0 */ - new_cr3 = (u64 *)(cr3[0] & PAGE_MASK); + new_cr3 = (u64 *)(native_pgd_val(cr3[0]) & PTE_PFN_MASK); /* copy the new root table if it is not 32-bit addressable */ if ((u64)new_cr3 > U32_MAX) diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c index e3f990d888d7..00f58e27f6de 100644 --- a/drivers/firmware/stratix10-svc.c +++ b/drivers/firmware/stratix10-svc.c @@ -134,6 +134,7 @@ struct stratix10_svc_data { * @complete_status: state for completion * @svc_fifo_lock: protect access to service message data queue * @invoke_fn: function to issue secure monitor call or hypervisor call + * @svc: manages the list of client svc drivers * * This struct is used to create communication channels for service clients, to * handle secure monitor or hypervisor call. @@ -150,6 +151,7 @@ struct stratix10_svc_controller { struct completion complete_status; spinlock_t svc_fifo_lock; svc_invoke_fn *invoke_fn; + struct stratix10_svc *svc; }; /** @@ -1206,6 +1208,7 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev) ret = -ENOMEM; goto err_free_kfifo; } + controller->svc = svc; svc->stratix10_svc_rsu = platform_device_alloc(STRATIX10_RSU, 0); if (!svc->stratix10_svc_rsu) { @@ -1237,8 +1240,6 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev) if (ret) goto err_unregister_fcs_dev; - dev_set_drvdata(dev, svc); - pr_info("Intel Service Layer Driver Initialized\n"); return 0; @@ -1256,8 +1257,8 @@ err_destroy_pool: static void stratix10_svc_drv_remove(struct platform_device *pdev) { - struct stratix10_svc *svc = dev_get_drvdata(&pdev->dev); struct stratix10_svc_controller *ctrl = platform_get_drvdata(pdev); + struct stratix10_svc *svc = ctrl->svc; of_platform_depopulate(ctrl->dev); diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index 37600faf4a4b..416f265d09d0 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -723,6 +723,7 @@ struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, chip->get_multiple = gpio_fwd_get_multiple_locked; chip->set = gpio_fwd_set; chip->set_multiple = gpio_fwd_set_multiple_locked; + chip->set_config = gpio_fwd_set_config; chip->to_irq = gpio_fwd_to_irq; chip->base = -1; chip->ngpio = ngpios; diff --git a/drivers/gpio/gpio-tb10x.c b/drivers/gpio/gpio-tb10x.c index 09a448ce3eec..3c8fd322a713 100644 --- a/drivers/gpio/gpio-tb10x.c +++ b/drivers/gpio/gpio-tb10x.c @@ -50,25 +50,6 @@ static inline u32 tb10x_reg_read(struct tb10x_gpio *gpio, unsigned int offs) return ioread32(gpio->base + offs); } -static inline void tb10x_reg_write(struct tb10x_gpio *gpio, unsigned int offs, - u32 val) -{ - iowrite32(val, gpio->base + offs); -} - -static inline void tb10x_set_bits(struct tb10x_gpio *gpio, unsigned int offs, - u32 mask, u32 val) -{ - u32 r; - - guard(gpio_generic_lock_irqsave)(&gpio->chip); - - r = tb10x_reg_read(gpio, offs); - r = (r & ~mask) | (val & mask); - - tb10x_reg_write(gpio, offs, r); -} - static int tb10x_gpio_to_irq(struct gpio_chip *chip, unsigned offset) { struct tb10x_gpio *tb10x_gpio = gpiochip_get_data(chip); diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c index 175836467f21..084656564176 100644 --- a/drivers/gpio/gpiolib-cdev.c +++ b/drivers/gpio/gpiolib-cdev.c @@ -298,12 +298,13 @@ static const struct file_operations linehandle_fileops = { #endif }; +DEFINE_FREE(linehandle_free, struct linehandle_state *, if (!IS_ERR_OR_NULL(_T)) linehandle_free(_T)) + static int linehandle_create(struct gpio_device *gdev, void __user *ip) { struct gpiohandle_request handlereq; - struct linehandle_state *lh; - struct file *file; - int fd, i, ret; + struct linehandle_state *lh __free(linehandle_free) = NULL; + int i, ret; u32 lflags; if (copy_from_user(&handlereq, ip, sizeof(handlereq))) @@ -327,10 +328,8 @@ static int linehandle_create(struct gpio_device *gdev, void __user *ip) lh->label = kstrndup(handlereq.consumer_label, sizeof(handlereq.consumer_label) - 1, GFP_KERNEL); - if (!lh->label) { - ret = -ENOMEM; - goto out_free_lh; - } + if (!lh->label) + return -ENOMEM; } lh->num_descs = handlereq.lines; @@ -340,20 +339,18 @@ static int linehandle_create(struct gpio_device *gdev, void __user *ip) u32 offset = handlereq.lineoffsets[i]; struct gpio_desc *desc = gpio_device_get_desc(gdev, offset); - if (IS_ERR(desc)) { - ret = PTR_ERR(desc); - goto out_free_lh; - } + if (IS_ERR(desc)) + return PTR_ERR(desc); ret = gpiod_request_user(desc, lh->label); if (ret) - goto out_free_lh; + return ret; lh->descs[i] = desc; linehandle_flags_to_desc_flags(handlereq.flags, &desc->flags); ret = gpiod_set_transitory(desc, false); if (ret < 0) - goto out_free_lh; + return ret; /* * Lines have to be requested explicitly for input @@ -364,11 +361,11 @@ static int linehandle_create(struct gpio_device *gdev, void __user *ip) ret = gpiod_direction_output_nonotify(desc, val); if (ret) - goto out_free_lh; + return ret; } else if (lflags & GPIOHANDLE_REQUEST_INPUT) { ret = gpiod_direction_input_nonotify(desc); if (ret) - goto out_free_lh; + return ret; } gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_REQUESTED); @@ -377,44 +374,23 @@ static int linehandle_create(struct gpio_device *gdev, void __user *ip) offset); } - fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); - if (fd < 0) { - ret = fd; - goto out_free_lh; - } - - file = anon_inode_getfile("gpio-linehandle", - &linehandle_fileops, - lh, - O_RDONLY | O_CLOEXEC); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto out_put_unused_fd; - } + FD_PREPARE(fdf, O_RDONLY | O_CLOEXEC, + anon_inode_getfile("gpio-linehandle", &linehandle_fileops, + lh, O_RDONLY | O_CLOEXEC)); + if (fdf.err) + return fdf.err; + retain_and_null_ptr(lh); - handlereq.fd = fd; - if (copy_to_user(ip, &handlereq, sizeof(handlereq))) { - /* - * fput() will trigger the release() callback, so do not go onto - * the regular error cleanup path here. - */ - fput(file); - put_unused_fd(fd); + handlereq.fd = fd_prepare_fd(fdf); + if (copy_to_user(ip, &handlereq, sizeof(handlereq))) return -EFAULT; - } - fd_install(fd, file); + fd_publish(fdf); dev_dbg(&gdev->dev, "registered chardev handle for %d lines\n", lh->num_descs); return 0; - -out_put_unused_fd: - put_unused_fd(fd); -out_free_lh: - linehandle_free(lh); - return ret; } #endif /* CONFIG_GPIO_CDEV_V1 */ @@ -2548,10 +2524,17 @@ static int lineinfo_changed_notify(struct notifier_block *nb, container_of(nb, struct gpio_chardev_data, lineinfo_changed_nb); struct lineinfo_changed_ctx *ctx; struct gpio_desc *desc = data; + struct file *fp; if (!test_bit(gpio_chip_hwgpio(desc), cdev->watched_lines)) return NOTIFY_DONE; + /* Keep the file descriptor alive for the duration of the notification. */ + fp = get_file_active(&cdev->fp); + if (!fp) + /* Chardev file descriptor was or is being released. */ + return NOTIFY_DONE; + /* * If this is called from atomic context (for instance: with a spinlock * taken by the atomic notifier chain), any sleeping calls must be done @@ -2575,8 +2558,6 @@ static int lineinfo_changed_notify(struct notifier_block *nb, /* Keep the GPIO device alive until we emit the event. */ ctx->gdev = gpio_device_get(desc->gdev); ctx->cdev = cdev; - /* Keep the file descriptor alive too. */ - get_file(ctx->cdev->fp); INIT_WORK(&ctx->work, lineinfo_changed_func); queue_work(ctx->gdev->line_state_wq, &ctx->work); diff --git a/drivers/gpio/gpiolib-swnode.c b/drivers/gpio/gpiolib-swnode.c index f21dbc28cf2c..e3806db1c0e0 100644 --- a/drivers/gpio/gpiolib-swnode.c +++ b/drivers/gpio/gpiolib-swnode.c @@ -41,7 +41,7 @@ static struct gpio_device *swnode_get_gpio_device(struct fwnode_handle *fwnode) !strcmp(gdev_node->name, GPIOLIB_SWNODE_UNDEFINED_NAME)) return ERR_PTR(-ENOENT); - gdev = gpio_device_find_by_label(gdev_node->name); + gdev = gpio_device_find_by_fwnode(fwnode); return gdev ?: ERR_PTR(-EPROBE_DEFER); } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 9952e412da50..cd8800ba5825 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -5296,6 +5296,8 @@ static void *gpiolib_seq_start(struct seq_file *s, loff_t *pos) struct gpio_device *gdev; loff_t index = *pos; + s->private = NULL; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return NULL; @@ -5329,7 +5331,11 @@ static void *gpiolib_seq_next(struct seq_file *s, void *v, loff_t *pos) static void gpiolib_seq_stop(struct seq_file *s, void *v) { - struct gpiolib_seq_priv *priv = s->private; + struct gpiolib_seq_priv *priv; + + priv = s->private; + if (!priv) + return; srcu_read_unlock(&gpio_devices_srcu, priv->idx); kfree(priv); diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile index 4b2f7d794275..da2565e6de71 100644 --- a/drivers/gpu/drm/Makefile +++ b/drivers/gpu/drm/Makefile @@ -245,7 +245,7 @@ always-$(CONFIG_DRM_HEADER_TEST) += \ quiet_cmd_hdrtest = HDRTEST $(patsubst %.hdrtest,%.h,$@) cmd_hdrtest = \ $(CC) $(c_flags) -fsyntax-only -x c /dev/null -include $< -include $<; \ - PYTHONDONTWRITEBYTECODE=1 $(KERNELDOC) -none $(if $(CONFIG_WERROR)$(CONFIG_DRM_WERROR),-Werror) $<; \ + PYTHONDONTWRITEBYTECODE=1 $(PYTHON3) $(KERNELDOC) -none $(if $(CONFIG_WERROR)$(CONFIG_DRM_WERROR),-Werror) $<; \ touch $@ $(obj)/%.hdrtest: $(src)/%.h FORCE diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index a2ca9acf8c4e..923f0fa7350c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1267,6 +1267,10 @@ static int unmap_bo_from_gpuvm(struct kgd_mem *mem, (void)amdgpu_vm_bo_unmap(adev, bo_va, entry->va); + /* VM entity stopped if process killed, don't clear freed pt bo */ + if (!amdgpu_vm_ready(vm)) + return 0; + (void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); (void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index f5d5c45ddc0d..afedea02188d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -236,7 +236,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip, r = amdgpu_xcp_select_scheds(adev, hw_ip, hw_prio, fpriv, &num_scheds, &scheds); if (r) - goto cleanup_entity; + goto error_free_entity; } /* disable load balance if the hw engine retains context among dependent jobs */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3d032c4e2dce..96b6738e6252 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2638,6 +2638,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) chip_name = "navi12"; break; case CHIP_CYAN_SKILLFISH: + if (adev->mman.discovery_bin) + return 0; chip_name = "cyan_skillfish"; break; } @@ -3414,10 +3416,11 @@ int amdgpu_device_set_pg_state(struct amdgpu_device *adev, (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) continue; - /* skip CG for VCE/UVD, it's handled specially */ + /* skip CG for VCE/UVD/VPE, it's handled specially */ if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && + adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VPE && adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && adev->ip_blocks[i].version->funcs->set_powergating_state) { /* enable powergating to save power */ @@ -5243,10 +5246,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, false); - r = amdgpu_dpm_notify_rlc_state(adev, false); - if (r) - return r; - return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 8561ad7f6180..ed3bef1edfe4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -82,6 +82,18 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf, struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); + /* + * Disable peer-to-peer access for DCC-enabled VRAM surfaces on GFX12+. + * Such buffers cannot be safely accessed over P2P due to device-local + * compression metadata. Fallback to system-memory path instead. + * Device supports GFX12 (GC 12.x or newer) + * BO was created with the AMDGPU_GEM_CREATE_GFX12_DCC flag + * + */ + if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0) && + bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC) + attach->peer2peer = false; + if (!amdgpu_dmabuf_is_xgmi_accessible(attach_adev, bo) && pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0) attach->peer2peer = false; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 61268aa82df4..7333e19291cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2632,9 +2632,14 @@ static int amdgpu_pmops_suspend_noirq(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(drm_dev); + int r; - if (amdgpu_acpi_should_gpu_reset(adev)) - return amdgpu_asic_reset(adev); + if (amdgpu_acpi_should_gpu_reset(adev)) { + amdgpu_device_lock_reset_domain(adev->reset_domain); + r = amdgpu_asic_reset(adev); + amdgpu_device_unlock_reset_domain(adev->reset_domain); + return r; + } return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 9dcf51991b5b..869bceb0fe2c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -597,6 +597,9 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev) /* reserve engine 5 for firmware */ if (adev->enable_mes) vm_inv_engs[i] &= ~(1 << 5); + /* reserve engine 6 for uni mes */ + if (adev->enable_uni_mes) + vm_inv_engs[i] &= ~(1 << 6); /* reserve mmhub engine 3 for firmware */ if (adev->enable_umsch_mm) vm_inv_engs[i] &= ~(1 << 3); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c index 9cddbf50442a..37270c4dab8d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c @@ -280,6 +280,8 @@ int isp_kernel_buffer_alloc(struct device *dev, u64 size, if (ret) return ret; + /* Ensure *bo is NULL so a new BO will be created */ + *bo = NULL; ret = amdgpu_bo_create_kernel(adev, size, ISP_MC_ADDR_ALIGN, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 8c0e5d03de50..aa7987d0806c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2355,8 +2355,11 @@ static int psp_securedisplay_initialize(struct psp_context *psp) if (!ret && !psp->securedisplay_context.context.resp_status) { psp->securedisplay_context.context.initialized = true; mutex_init(&psp->securedisplay_context.mutex); - } else + } else { + /* don't try again */ + psp->securedisplay_context.context.bin_desc.size_bytes = 0; return ret; + } mutex_lock(&psp->securedisplay_context.mutex); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index aa9ee5dffa45..9d568c16beb1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1372,7 +1372,7 @@ uint64_t amdgpu_ttm_tt_pde_flags(struct ttm_tt *ttm, struct ttm_resource *mem) mem->mem_type == AMDGPU_PL_MMIO_REMAP)) { flags |= AMDGPU_PTE_SYSTEM; - if (ttm->caching == ttm_cached) + if (ttm && ttm->caching == ttm_cached) flags |= AMDGPU_PTE_SNOOPED; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 761bad98da3e..4d0096d0baa9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -151,15 +151,16 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d { struct amdgpu_userq_fence *userq_fence, *tmp; struct dma_fence *fence; + unsigned long flags; u64 rptr; int i; if (!fence_drv) return; + spin_lock_irqsave(&fence_drv->fence_list_lock, flags); rptr = amdgpu_userq_fence_read(fence_drv); - spin_lock(&fence_drv->fence_list_lock); list_for_each_entry_safe(userq_fence, tmp, &fence_drv->fences, link) { fence = &userq_fence->base; @@ -174,7 +175,7 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d list_del(&userq_fence->link); dma_fence_put(fence); } - spin_unlock(&fence_drv->fence_list_lock); + spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags); } void amdgpu_userq_fence_driver_destroy(struct kref *ref) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index c1a801203949..676e24fb8864 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1066,7 +1066,7 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params, } /* Prepare a TLB flush fence to be attached to PTs */ - if (!params->unlocked && vm->is_compute_context) { + if (!params->unlocked) { amdgpu_vm_tlb_fence_create(params->adev, vm, fence); /* Makes sure no PD/PT is freed before the flush */ @@ -2078,7 +2078,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, struct amdgpu_bo *bo = before->bo_va->base.bo; amdgpu_vm_it_insert(before, &vm->va); - if (before->flags & AMDGPU_PTE_PRT_FLAG(adev)) + if (before->flags & AMDGPU_VM_PAGE_PRT) amdgpu_vm_prt_get(adev); if (amdgpu_vm_is_bo_always_valid(vm, bo) && @@ -2093,7 +2093,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev, struct amdgpu_bo *bo = after->bo_va->base.bo; amdgpu_vm_it_insert(after, &vm->va); - if (after->flags & AMDGPU_PTE_PRT_FLAG(adev)) + if (after->flags & AMDGPU_VM_PAGE_PRT) amdgpu_vm_prt_get(adev); if (amdgpu_vm_is_bo_always_valid(vm, bo) && diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index 811124ff88a8..f9e2edf5260b 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -407,7 +407,8 @@ static int aqua_vanjaram_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, return -EINVAL; } - if (adev->kfd.init_complete && !amdgpu_in_reset(adev)) + if (adev->kfd.init_complete && !amdgpu_in_reset(adev) && + !adev->in_suspend) flags |= AMDGPU_XCP_OPS_KFD; if (flags & AMDGPU_XCP_OPS_KFD) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index d61eb9f187c6..f2be16e700c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -5872,9 +5872,9 @@ static void gfx_v11_0_ring_emit_ib_gfx(struct amdgpu_ring *ring, if (flags & AMDGPU_IB_PREEMPTED) control |= INDIRECT_BUFFER_PRE_RESUME(1); - if (vmid) + if (vmid && !ring->adev->gfx.rs64_enable) gfx_v11_0_ring_emit_de_meta(ring, - (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ? true : false); + !amdgpu_sriov_vf(ring->adev) && (flags & AMDGPU_IB_PREEMPTED)); } amdgpu_ring_write(ring, header); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index 7693b7953426..80565392313f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -3102,6 +3102,11 @@ static int gfx_v6_0_sw_init(struct amdgpu_ip_block *ip_block) return r; } + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index 5976ed55d9db..2b7aba22ecc1 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -4399,6 +4399,11 @@ static int gfx_v7_0_sw_init(struct amdgpu_ip_block *ip_block) gfx_v7_0_gpu_early_init(adev); + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 0856ff65288c..8a81713d97aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -2023,6 +2023,11 @@ static int gfx_v8_0_sw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 77f9d5b9a556..c90cbe053ef3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -2292,7 +2292,9 @@ static int gfx_v9_4_3_cp_resume(struct amdgpu_device *adev) r = amdgpu_xcp_init(adev->xcp_mgr, num_xcp, mode); } else { - if (amdgpu_xcp_query_partition_mode(adev->xcp_mgr, + if (adev->in_suspend) + amdgpu_xcp_restore_partition_mode(adev->xcp_mgr); + else if (amdgpu_xcp_query_partition_mode(adev->xcp_mgr, AMDGPU_XCP_FL_NONE) == AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE) r = amdgpu_xcp_switch_partition_mode( diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c index baf097d2e1ac..ab0bf880d3d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c @@ -878,6 +878,7 @@ static const struct amdgpu_ring_funcs jpeg_v5_0_1_dec_ring_vm_funcs = { .get_rptr = jpeg_v5_0_1_dec_ring_get_rptr, .get_wptr = jpeg_v5_0_1_dec_ring_get_wptr, .set_wptr = jpeg_v5_0_1_dec_ring_set_wptr, + .parse_cs = amdgpu_jpeg_dec_parse_cs, .emit_frame_size = SOC15_FLUSH_GPU_TLB_NUM_WREG * 6 + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 8 + diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index 64b240b51f1a..a9be7a505026 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -142,13 +142,37 @@ static int psp_v11_0_init_microcode(struct psp_context *psp) return err; } -static int psp_v11_0_wait_for_bootloader(struct psp_context *psp) +static int psp_v11_wait_for_tos_unload(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; + uint32_t sol_reg1, sol_reg2; + int retry_loop; + /* Wait for the TOS to be unloaded */ + for (retry_loop = 0; retry_loop < 20; retry_loop++) { + sol_reg1 = RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_81); + usleep_range(1000, 2000); + sol_reg2 = RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_81); + if (sol_reg1 == sol_reg2) + return 0; + } + dev_err(adev->dev, "TOS unload failed, C2PMSG_33: %x C2PMSG_81: %x", + RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_33), + RREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_81)); + + return -ETIME; +} + +static int psp_v11_0_wait_for_bootloader(struct psp_context *psp) +{ + struct amdgpu_device *adev = psp->adev; int ret; int retry_loop; + /* For a reset done at the end of S3, only wait for TOS to be unloaded */ + if (adev->in_s3 && !(adev->flags & AMD_IS_APU) && amdgpu_in_reset(adev)) + return psp_v11_wait_for_tos_unload(psp); + for (retry_loop = 0; retry_loop < 20; retry_loop++) { /* Wait for bootloader to signify that is ready having bit 31 of C2PMSG_35 set to 1 */ diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index eacf4e93ba2f..cb7123ec1a5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -141,7 +141,7 @@ static int vcn_v4_0_3_late_init(struct amdgpu_ip_block *ip_block) adev->vcn.supported_reset = amdgpu_get_soft_full_reset_mask(&adev->vcn.inst[0].ring_enc[0]); - if (amdgpu_dpm_reset_vcn_is_supported(adev)) + if (amdgpu_dpm_reset_vcn_is_supported(adev) && !amdgpu_sriov_vf(adev)) adev->vcn.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c index 714350cabf2f..8bd457dea4cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c @@ -122,7 +122,9 @@ static int vcn_v5_0_1_late_init(struct amdgpu_ip_block *ip_block) switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { case IP_VERSION(13, 0, 12): - if ((adev->psp.sos.fw_version >= 0x00450025) && amdgpu_dpm_reset_vcn_is_supported(adev)) + if ((adev->psp.sos.fw_version >= 0x00450025) && + amdgpu_dpm_reset_vcn_is_supported(adev) && + !amdgpu_sriov_vf(adev)) adev->vcn.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; break; default: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index a65c67cf56ff..f1e7583650c4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -297,16 +297,16 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope goto out_err_unreserve; } - if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) { - pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n", + if (properties->ctx_save_restore_area_size < topo_dev->node_props.cwsr_size) { + pr_debug("queue cwsr size 0x%x not sufficient for node cwsr size 0x%x\n", properties->ctx_save_restore_area_size, topo_dev->node_props.cwsr_size); err = -EINVAL; goto out_err_unreserve; } - total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) - * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = (properties->ctx_save_restore_area_size + + topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask); total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address, @@ -352,8 +352,8 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope topo_dev = kfd_topology_device_by_id(pdd->dev->id); if (!topo_dev) return -EINVAL; - total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size) - * NUM_XCC(pdd->dev->xcc_mask); + total_cwsr_size = (properties->ctx_save_restore_area_size + + topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask); total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE); kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 9d72411c3379..74a1d3e1d52b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -3687,6 +3687,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping); /* TODO: unmap ranges from GPU that lost access */ } + update_mapping |= !p->xnack_enabled && !list_empty(&remap_list); + list_for_each_entry_safe(prange, next, &remove_list, update_list) { pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, prange->start, diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index bfa3199591b6..7fe40bbba265 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3563,6 +3563,7 @@ static int dm_resume(struct amdgpu_ip_block *ip_block) /* Do mst topology probing after resuming cached state*/ drm_connector_list_iter_begin(ddev, &iter); drm_for_each_connector_iter(connector, &iter) { + bool init = false; if (connector->connector_type == DRM_MODE_CONNECTOR_WRITEBACK) continue; @@ -3572,7 +3573,14 @@ static int dm_resume(struct amdgpu_ip_block *ip_block) aconnector->mst_root) continue; - drm_dp_mst_topology_queue_probe(&aconnector->mst_mgr); + scoped_guard(mutex, &aconnector->mst_mgr.lock) { + init = !aconnector->mst_mgr.mst_primary; + } + if (init) + dm_helpers_dp_mst_start_top_mgr(aconnector->dc_link->ctx, + aconnector->dc_link, false); + else + drm_dp_mst_topology_queue_probe(&aconnector->mst_mgr); } drm_connector_list_iter_end(&iter); @@ -3851,6 +3859,97 @@ void amdgpu_dm_update_connector_after_detect( update_subconnector_property(aconnector); } +static bool are_sinks_equal(const struct dc_sink *sink1, const struct dc_sink *sink2) +{ + if (!sink1 || !sink2) + return false; + if (sink1->sink_signal != sink2->sink_signal) + return false; + + if (sink1->dc_edid.length != sink2->dc_edid.length) + return false; + + if (memcmp(sink1->dc_edid.raw_edid, sink2->dc_edid.raw_edid, + sink1->dc_edid.length) != 0) + return false; + return true; +} + + +/** + * DOC: hdmi_hpd_debounce_work + * + * HDMI HPD debounce delay in milliseconds. When an HDMI display toggles HPD + * (such as during power save transitions), this delay determines how long to + * wait before processing the HPD event. This allows distinguishing between a + * physical unplug (>hdmi_hpd_debounce_delay) + * and a spontaneous RX HPD toggle (<hdmi_hpd_debounce_delay). + * + * If the toggle is less than this delay, the driver compares sink capabilities + * and permits a hotplug event if they changed. + * + * The default value of 1500ms was chosen based on experimental testing with + * various monitors that exhibit spontaneous HPD toggling behavior. + */ +static void hdmi_hpd_debounce_work(struct work_struct *work) +{ + struct amdgpu_dm_connector *aconnector = + container_of(to_delayed_work(work), struct amdgpu_dm_connector, + hdmi_hpd_debounce_work); + struct drm_connector *connector = &aconnector->base; + struct drm_device *dev = connector->dev; + struct amdgpu_device *adev = drm_to_adev(dev); + struct dc *dc = aconnector->dc_link->ctx->dc; + bool fake_reconnect = false; + bool reallow_idle = false; + bool ret = false; + guard(mutex)(&aconnector->hpd_lock); + + /* Re-detect the display */ + scoped_guard(mutex, &adev->dm.dc_lock) { + if (dc->caps.ips_support && dc->ctx->dmub_srv->idle_allowed) { + dc_allow_idle_optimizations(dc, false); + reallow_idle = true; + } + ret = dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD); + } + + if (ret) { + /* Apply workaround delay for certain panels */ + apply_delay_after_dpcd_poweroff(adev, aconnector->dc_sink); + /* Compare sinks to determine if this was a spontaneous HPD toggle */ + if (are_sinks_equal(aconnector->dc_link->local_sink, aconnector->hdmi_prev_sink)) { + /* + * Sinks match - this was a spontaneous HDMI HPD toggle. + */ + drm_dbg_kms(dev, "HDMI HPD: Sink unchanged after debounce, internal re-enable\n"); + fake_reconnect = true; + } + + /* Update connector state */ + amdgpu_dm_update_connector_after_detect(aconnector); + + drm_modeset_lock_all(dev); + dm_restore_drm_connector_state(dev, connector); + drm_modeset_unlock_all(dev); + + /* Only notify OS if sink actually changed */ + if (!fake_reconnect && aconnector->base.force == DRM_FORCE_UNSPECIFIED) + drm_kms_helper_hotplug_event(dev); + } + + /* Release the cached sink reference */ + if (aconnector->hdmi_prev_sink) { + dc_sink_release(aconnector->hdmi_prev_sink); + aconnector->hdmi_prev_sink = NULL; + } + + scoped_guard(mutex, &adev->dm.dc_lock) { + if (reallow_idle && dc->caps.ips_support) + dc_allow_idle_optimizations(dc, true); + } +} + static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector) { struct drm_connector *connector = &aconnector->base; @@ -3860,6 +3959,7 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector) struct dm_connector_state *dm_con_state = to_dm_connector_state(connector->state); struct dc *dc = aconnector->dc_link->ctx->dc; bool ret = false; + bool debounce_required = false; if (adev->dm.disable_hpd_irq) return; @@ -3882,6 +3982,14 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector) if (!dc_link_detect_connection_type(aconnector->dc_link, &new_connection_type)) drm_err(adev_to_drm(adev), "KMS: Failed to detect connector\n"); + /* + * Check for HDMI disconnect with debounce enabled. + */ + debounce_required = (aconnector->hdmi_hpd_debounce_delay_ms > 0 && + dc_is_hdmi_signal(aconnector->dc_link->connector_signal) && + new_connection_type == dc_connection_none && + aconnector->dc_link->local_sink != NULL); + if (aconnector->base.force && new_connection_type == dc_connection_none) { emulated_link_detect(aconnector->dc_link); @@ -3891,7 +3999,34 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector) if (aconnector->base.force == DRM_FORCE_UNSPECIFIED) drm_kms_helper_connector_hotplug_event(connector); + } else if (debounce_required) { + /* + * HDMI disconnect detected - schedule delayed work instead of + * processing immediately. This allows us to coalesce spurious + * HDMI signals from physical unplugs. + */ + drm_dbg_kms(dev, "HDMI HPD: Disconnect detected, scheduling debounce work (%u ms)\n", + aconnector->hdmi_hpd_debounce_delay_ms); + + /* Cache the current sink for later comparison */ + if (aconnector->hdmi_prev_sink) + dc_sink_release(aconnector->hdmi_prev_sink); + aconnector->hdmi_prev_sink = aconnector->dc_link->local_sink; + if (aconnector->hdmi_prev_sink) + dc_sink_retain(aconnector->hdmi_prev_sink); + + /* Schedule delayed detection. */ + if (mod_delayed_work(system_wq, + &aconnector->hdmi_hpd_debounce_work, + msecs_to_jiffies(aconnector->hdmi_hpd_debounce_delay_ms))) + drm_dbg_kms(dev, "HDMI HPD: Re-scheduled debounce work\n"); + } else { + + /* If the aconnector->hdmi_hpd_debounce_work is scheduled, exit early */ + if (delayed_work_pending(&aconnector->hdmi_hpd_debounce_work)) + return; + scoped_guard(mutex, &adev->dm.dc_lock) { dc_exit_ips_for_hw_access(dc); ret = dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD); @@ -4917,6 +5052,21 @@ static void amdgpu_dm_backlight_set_level(struct amdgpu_display_manager *dm, struct dc_link *link; u32 brightness; bool rc, reallow_idle = false; + struct drm_connector *connector; + + list_for_each_entry(connector, &dm->ddev->mode_config.connector_list, head) { + struct amdgpu_dm_connector *aconnector = to_amdgpu_dm_connector(connector); + + if (aconnector->bl_idx != bl_idx) + continue; + + /* if connector is off, save the brightness for next time it's on */ + if (!aconnector->base.encoder) { + dm->brightness[bl_idx] = user_brightness; + dm->actual_brightness[bl_idx] = 0; + return; + } + } amdgpu_dm_update_backlight_caps(dm, bl_idx); caps = &dm->backlight_caps[bl_idx]; @@ -7380,6 +7530,13 @@ static void amdgpu_dm_connector_destroy(struct drm_connector *connector) if (aconnector->mst_mgr.dev) drm_dp_mst_topology_mgr_destroy(&aconnector->mst_mgr); + /* Cancel and flush any pending HDMI HPD debounce work */ + cancel_delayed_work_sync(&aconnector->hdmi_hpd_debounce_work); + if (aconnector->hdmi_prev_sink) { + dc_sink_release(aconnector->hdmi_prev_sink); + aconnector->hdmi_prev_sink = NULL; + } + if (aconnector->bl_idx != -1) { backlight_device_unregister(dm->backlight_dev[aconnector->bl_idx]); dm->backlight_dev[aconnector->bl_idx] = NULL; @@ -8030,7 +8187,7 @@ static int dm_encoder_helper_atomic_check(struct drm_encoder *encoder, "mode %dx%d@%dHz is not native, enabling scaling\n", adjusted_mode->hdisplay, adjusted_mode->vdisplay, drm_mode_vrefresh(adjusted_mode)); - dm_new_connector_state->scaling = RMX_FULL; + dm_new_connector_state->scaling = RMX_ASPECT; } return 0; } @@ -8541,6 +8698,10 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, mutex_init(&aconnector->hpd_lock); mutex_init(&aconnector->handle_mst_msg_ready); + aconnector->hdmi_hpd_debounce_delay_ms = AMDGPU_DM_HDMI_HPD_DEBOUNCE_MS; + INIT_DELAYED_WORK(&aconnector->hdmi_hpd_debounce_work, hdmi_hpd_debounce_work); + aconnector->hdmi_prev_sink = NULL; + /* * configure support HPD hot plug connector_>polled default value is 0 * which means HPD hot plug not supported diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index db75e991ac7b..8ca738957598 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -59,6 +59,7 @@ #define AMDGPU_HDR_MULT_DEFAULT (0x100000000LL) +#define AMDGPU_DM_HDMI_HPD_DEBOUNCE_MS 1500 /* #include "include/amdgpu_dal_power_if.h" #include "amdgpu_dm_irq.h" @@ -819,6 +820,11 @@ struct amdgpu_dm_connector { bool pack_sdp_v1_3; enum adaptive_sync_type as_type; struct amdgpu_hdmi_vsdb_info vsdb_info; + + /* HDMI HPD debounce support */ + unsigned int hdmi_hpd_debounce_delay_ms; + struct delayed_work hdmi_hpd_debounce_work; + struct dc_sink *hdmi_prev_sink; }; static inline void amdgpu_dm_set_mst_status(uint8_t *status, diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c index f263e1a4537e..00dac862b665 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c @@ -1302,7 +1302,8 @@ static int odm_combine_segments_show(struct seq_file *m, void *unused) if (connector->status != connector_status_connected) return -ENODEV; - if (pipe_ctx != NULL && pipe_ctx->stream_res.tg->funcs->get_odm_combine_segments) + if (pipe_ctx && pipe_ctx->stream_res.tg && + pipe_ctx->stream_res.tg->funcs->get_odm_combine_segments) pipe_ctx->stream_res.tg->funcs->get_odm_combine_segments(pipe_ctx->stream_res.tg, &segments); seq_printf(m, "%d\n", segments); diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index cc21337a182f..d0f770dd0a95 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -997,8 +997,8 @@ enum dc_edid_status dm_helpers_read_local_edid( struct amdgpu_dm_connector *aconnector = link->priv; struct drm_connector *connector = &aconnector->base; struct i2c_adapter *ddc; - int retry = 3; - enum dc_edid_status edid_status; + int retry = 25; + enum dc_edid_status edid_status = EDID_NO_RESPONSE; const struct drm_edid *drm_edid; const struct edid *edid; @@ -1028,7 +1028,7 @@ enum dc_edid_status dm_helpers_read_local_edid( } if (!drm_edid) - return EDID_NO_RESPONSE; + continue; edid = drm_edid_raw(drm_edid); // FIXME: Get rid of drm_edid_raw() if (!edid || @@ -1046,7 +1046,7 @@ enum dc_edid_status dm_helpers_read_local_edid( &sink->dc_edid, &sink->edid_caps); - } while (edid_status == EDID_BAD_CHECKSUM && --retry > 0); + } while ((edid_status == EDID_BAD_CHECKSUM || edid_status == EDID_NO_RESPONSE) && --retry > 0); if (edid_status != EDID_OK) DRM_ERROR("EDID err: %d, on connector: %s", diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 5e92eaa67aa3..dbd1da4d85d3 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -884,26 +884,28 @@ struct dsc_mst_fairness_params { }; #if defined(CONFIG_DRM_AMD_DC_FP) -static uint16_t get_fec_overhead_multiplier(struct dc_link *dc_link) +static uint64_t kbps_to_pbn(int kbps, bool is_peak_pbn) { - u8 link_coding_cap; - uint16_t fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B; + uint64_t effective_kbps = (uint64_t)kbps; - link_coding_cap = dc_link_dp_mst_decide_link_encoding_format(dc_link); - if (link_coding_cap == DP_128b_132b_ENCODING) - fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B; + if (is_peak_pbn) { // add 0.6% (1006/1000) overhead into effective kbps + effective_kbps *= 1006; + effective_kbps = div_u64(effective_kbps, 1000); + } - return fec_overhead_multiplier_x1000; + return (uint64_t) DIV64_U64_ROUND_UP(effective_kbps * 64, (54 * 8 * 1000)); } -static int kbps_to_peak_pbn(int kbps, uint16_t fec_overhead_multiplier_x1000) +static uint32_t pbn_to_kbps(unsigned int pbn, bool with_margin) { - u64 peak_kbps = kbps; + uint64_t pbn_effective = (uint64_t)pbn; + + if (with_margin) // deduct 0.6% (994/1000) overhead from effective pbn + pbn_effective *= (1000000 / PEAK_FACTOR_X1000); + else + pbn_effective *= 1000; - peak_kbps *= 1006; - peak_kbps *= fec_overhead_multiplier_x1000; - peak_kbps = div_u64(peak_kbps, 1000 * 1000); - return (int) DIV64_U64_ROUND_UP(peak_kbps * 64, (54 * 8 * 1000)); + return DIV_U64_ROUND_UP(pbn_effective * 8 * 54, 64); } static void set_dsc_configs_from_fairness_vars(struct dsc_mst_fairness_params *params, @@ -974,7 +976,7 @@ static int bpp_x16_from_pbn(struct dsc_mst_fairness_params param, int pbn) dc_dsc_get_default_config_option(param.sink->ctx->dc, &dsc_options); dsc_options.max_target_bpp_limit_override_x16 = drm_connector->display_info.max_dsc_bpp * 16; - kbps = div_u64((u64)pbn * 994 * 8 * 54, 64); + kbps = pbn_to_kbps(pbn, false); dc_dsc_compute_config( param.sink->ctx->dc->res_pool->dscs[0], ¶m.sink->dsc_caps.dsc_dec_caps, @@ -1003,12 +1005,11 @@ static int increase_dsc_bpp(struct drm_atomic_state *state, int link_timeslots_used; int fair_pbn_alloc; int ret = 0; - uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); for (i = 0; i < count; i++) { if (vars[i + k].dsc_enabled) { initial_slack[i] = - kbps_to_peak_pbn(params[i].bw_range.max_kbps, fec_overhead_multiplier_x1000) - vars[i + k].pbn; + kbps_to_pbn(params[i].bw_range.max_kbps, false) - vars[i + k].pbn; bpp_increased[i] = false; remaining_to_increase += 1; } else { @@ -1104,7 +1105,6 @@ static int try_disable_dsc(struct drm_atomic_state *state, int next_index; int remaining_to_try = 0; int ret; - uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); int var_pbn; for (i = 0; i < count; i++) { @@ -1137,7 +1137,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, DRM_DEBUG_DRIVER("MST_DSC index #%d, try no compression\n", next_index); var_pbn = vars[next_index].pbn; - vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps, fec_overhead_multiplier_x1000); + vars[next_index].pbn = kbps_to_pbn(params[next_index].bw_range.stream_kbps, true); ret = drm_dp_atomic_find_time_slots(state, params[next_index].port->mgr, params[next_index].port, @@ -1197,7 +1197,6 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, int count = 0; int i, k, ret; bool debugfs_overwrite = false; - uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); struct drm_connector_state *new_conn_state; memset(params, 0, sizeof(params)); @@ -1278,7 +1277,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, DRM_DEBUG_DRIVER("MST_DSC Try no compression\n"); for (i = 0; i < count; i++) { vars[i + k].aconnector = params[i].aconnector; - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000); + vars[i + k].pbn = kbps_to_pbn(params[i].bw_range.stream_kbps, false); vars[i + k].dsc_enabled = false; vars[i + k].bpp_x16 = 0; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, params[i].port, @@ -1300,7 +1299,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, DRM_DEBUG_DRIVER("MST_DSC Try max compression\n"); for (i = 0; i < count; i++) { if (params[i].compression_possible && params[i].clock_force_enable != DSC_CLK_FORCE_DISABLE) { - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps, fec_overhead_multiplier_x1000); + vars[i + k].pbn = kbps_to_pbn(params[i].bw_range.min_kbps, false); vars[i + k].dsc_enabled = true; vars[i + k].bpp_x16 = params[i].bw_range.min_target_bpp_x16; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, @@ -1308,7 +1307,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, if (ret < 0) return ret; } else { - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000); + vars[i + k].pbn = kbps_to_pbn(params[i].bw_range.stream_kbps, false); vars[i + k].dsc_enabled = false; vars[i + k].bpp_x16 = 0; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, @@ -1763,18 +1762,6 @@ clean_exit: return ret; } -static uint32_t kbps_from_pbn(unsigned int pbn) -{ - uint64_t kbps = (uint64_t)pbn; - - kbps *= (1000000 / PEAK_FACTOR_X1000); - kbps *= 8; - kbps *= 54; - kbps /= 64; - - return (uint32_t)kbps; -} - static bool is_dsc_common_config_possible(struct dc_stream_state *stream, struct dc_dsc_bw_range *bw_range) { @@ -1873,7 +1860,7 @@ enum dc_status dm_dp_mst_is_port_support_mode( dc_link_get_highest_encoding_format(stream->link)); cur_link_settings = stream->link->verified_link_cap; root_link_bw_in_kbps = dc_link_bandwidth_kbps(aconnector->dc_link, &cur_link_settings); - virtual_channel_bw_in_kbps = kbps_from_pbn(aconnector->mst_output_port->full_pbn); + virtual_channel_bw_in_kbps = pbn_to_kbps(aconnector->mst_output_port->full_pbn, true); /* pick the end to end bw bottleneck */ end_to_end_bw_in_kbps = min(root_link_bw_in_kbps, virtual_channel_bw_in_kbps); @@ -1926,7 +1913,7 @@ enum dc_status dm_dp_mst_is_port_support_mode( immediate_upstream_port = aconnector->mst_output_port->parent->port_parent; if (immediate_upstream_port) { - virtual_channel_bw_in_kbps = kbps_from_pbn(immediate_upstream_port->full_pbn); + virtual_channel_bw_in_kbps = pbn_to_kbps(immediate_upstream_port->full_pbn, true); virtual_channel_bw_in_kbps = min(root_link_bw_in_kbps, virtual_channel_bw_in_kbps); } else { /* For topology LCT 1 case - only one mstb*/ diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index b11383fba35f..1eb04772f5da 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -394,6 +394,8 @@ void dcn35_update_clocks(struct clk_mgr *clk_mgr_base, display_count = dcn35_get_active_display_cnt_wa(dc, context, &all_active_disps); if (new_clocks->dtbclk_en && !new_clocks->ref_dtbclk_khz) new_clocks->ref_dtbclk_khz = 600000; + else if (!new_clocks->dtbclk_en && new_clocks->ref_dtbclk_khz > 590000) + new_clocks->ref_dtbclk_khz = 0; /* * if it is safe to lower, but we are already in the lower state, we don't have to do anything @@ -435,7 +437,7 @@ void dcn35_update_clocks(struct clk_mgr *clk_mgr_base, actual_dtbclk = REG_READ(CLK1_CLK4_CURRENT_CNT); - if (actual_dtbclk) { + if (actual_dtbclk > 590000) { clk_mgr_base->clks.ref_dtbclk_khz = new_clocks->ref_dtbclk_khz; clk_mgr_base->clks.dtbclk_en = new_clocks->dtbclk_en; } diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c index 9ac2d41f8fca..0a46e834357a 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c @@ -705,9 +705,14 @@ bool dc_stream_get_scanoutpos(const struct dc_stream_state *stream, { uint8_t i; bool ret = false; - struct dc *dc = stream->ctx->dc; - struct resource_context *res_ctx = - &dc->current_state->res_ctx; + struct dc *dc; + struct resource_context *res_ctx; + + if (!stream->ctx) + return false; + + dc = stream->ctx->dc; + res_ctx = &dc->current_state->res_ctx; dc_exit_ips_for_hw_access(dc); diff --git a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c index de6d62401362..c899c09ea31b 100644 --- a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c +++ b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c @@ -1411,7 +1411,7 @@ static void dccg35_set_dtbclk_dto( __func__, params->otg_inst, params->pixclk_khz, params->ref_dtbclk_khz, req_dtbclk_khz, phase, modulo); - } else { + } else if (!params->ref_dtbclk_khz && !req_dtbclk_khz) { switch (params->otg_inst) { case 0: REG_UPDATE(DCCG_GATE_DISABLE_CNTL5, DTBCLK_P0_GATE_DISABLE, 0); diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c index 24184b4eb352..ebc220b29d14 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c @@ -671,7 +671,6 @@ void dce110_enable_stream(struct pipe_ctx *pipe_ctx) uint32_t early_control = 0; struct timing_generator *tg = pipe_ctx->stream_res.tg; - link_hwss->setup_stream_attribute(pipe_ctx); link_hwss->setup_stream_encoder(pipe_ctx); dc->hwss.update_info_frame(pipe_ctx); diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c index 9477c9f9e196..56c1ab6c7330 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c @@ -614,6 +614,14 @@ void dcn20_dpp_pg_control( * DOMAIN11_PGFSM_PWR_STATUS, pwr_status, * 1, 1000); */ + + /* Force disable cursor on plane powerdown on DPP 5 using dpp_force_disable_cursor */ + if (!power_on) { + struct dpp *dpp5 = hws->ctx->dc->res_pool->dpps[dpp_inst]; + if (dpp5 && dpp5->funcs->dpp_force_disable_cursor) + dpp5->funcs->dpp_force_disable_cursor(dpp5); + } + break; default: BREAK_TO_DEBUGGER(); @@ -3052,8 +3060,6 @@ void dcn20_enable_stream(struct pipe_ctx *pipe_ctx) link_enc->transmitter - TRANSMITTER_UNIPHY_A); } - link_hwss->setup_stream_attribute(pipe_ctx); - if (dc->res_pool->dccg->funcs->set_pixel_rate_div) dc->res_pool->dccg->funcs->set_pixel_rate_div( dc->res_pool->dccg, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c index ce3d0b45fb4c..68e48a2492c9 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c @@ -971,8 +971,6 @@ void dcn401_enable_stream(struct pipe_ctx *pipe_ctx) } } - link_hwss->setup_stream_attribute(pipe_ctx); - if (dc->res_pool->dccg->funcs->set_pixel_rate_div) { dc->res_pool->dccg->funcs->set_pixel_rate_div( dc->res_pool->dccg, diff --git a/drivers/gpu/drm/amd/display/dc/link/link_detection.c b/drivers/gpu/drm/amd/display/dc/link/link_detection.c index 85303167a553..1173c53359b0 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_detection.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_detection.c @@ -1141,6 +1141,7 @@ static bool detect_link_and_local_sink(struct dc_link *link, !sink->edid_caps.edid_hdmi) sink->sink_signal = SIGNAL_TYPE_DVI_SINGLE_LINK; else if (dc_is_dvi_signal(sink->sink_signal) && + dc_is_dvi_signal(link->connector_signal) && aud_support->hdmi_audio_native && sink->edid_caps.edid_hdmi) sink->sink_signal = SIGNAL_TYPE_HDMI_TYPE_A; diff --git a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c index 83419e1a9036..b66fbcb0040d 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c @@ -2458,6 +2458,7 @@ void link_set_dpms_on( struct link_encoder *link_enc = pipe_ctx->link_res.dio_link_enc; enum otg_out_mux_dest otg_out_dest = OUT_MUX_DIO; struct vpg *vpg = pipe_ctx->stream_res.stream_enc->vpg; + const struct link_hwss *link_hwss = get_link_hwss(link, &pipe_ctx->link_res); bool apply_edp_fast_boot_optimization = pipe_ctx->stream->apply_edp_fast_boot_optimization; @@ -2502,6 +2503,8 @@ void link_set_dpms_on( pipe_ctx->stream_res.tg->funcs->set_out_mux(pipe_ctx->stream_res.tg, otg_out_dest); } + link_hwss->setup_stream_attribute(pipe_ctx); + pipe_ctx->stream->apply_edp_fast_boot_optimization = false; // Enable VPG before building infoframe diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c index b12c11bd6a14..eb262ce42e2d 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c @@ -1691,7 +1691,7 @@ static bool retrieve_link_cap(struct dc_link *link) union edp_configuration_cap edp_config_cap; union dp_downstream_port_present ds_port = { 0 }; enum dc_status status = DC_ERROR_UNEXPECTED; - uint32_t read_dpcd_retry_cnt = 3; + uint32_t read_dpcd_retry_cnt = 20; int i; struct dp_sink_hw_fw_revision dp_hw_fw_revision; const uint32_t post_oui_delay = 30; // 30ms @@ -1734,12 +1734,13 @@ static bool retrieve_link_cap(struct dc_link *link) } dpcd_set_source_specific_data(link); - /* Sink may need to configure internals based on vendor, so allow some - * time before proceeding with possibly vendor specific transactions - */ - msleep(post_oui_delay); for (i = 0; i < read_dpcd_retry_cnt; i++) { + /* + * Sink may need to configure internals based on vendor, so allow some + * time before proceeding with possibly vendor specific transactions + */ + msleep(post_oui_delay); status = core_link_read_dpcd( link, DP_DPCD_REV, diff --git a/drivers/gpu/drm/amd/display/dc/virtual/virtual_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/virtual/virtual_stream_encoder.c index 6ffc74fc9dcd..ad088d70e189 100644 --- a/drivers/gpu/drm/amd/display/dc/virtual/virtual_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/virtual/virtual_stream_encoder.c @@ -44,11 +44,6 @@ static void virtual_stream_encoder_dvi_set_stream_attribute( struct dc_crtc_timing *crtc_timing, bool is_dual_link) {} -static void virtual_stream_encoder_lvds_set_stream_attribute( - struct stream_encoder *enc, - struct dc_crtc_timing *crtc_timing) -{} - static void virtual_stream_encoder_set_throttled_vcp_size( struct stream_encoder *enc, struct fixed31_32 avg_time_slots_per_mtp) @@ -120,8 +115,6 @@ static const struct stream_encoder_funcs virtual_str_enc_funcs = { virtual_stream_encoder_hdmi_set_stream_attribute, .dvi_set_stream_attribute = virtual_stream_encoder_dvi_set_stream_attribute, - .lvds_set_stream_attribute = - virtual_stream_encoder_lvds_set_stream_attribute, .set_throttled_vcp_size = virtual_stream_encoder_set_throttled_vcp_size, .update_hdmi_info_packets = diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c index ce421bcddcb0..1aae46d703ba 100644 --- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c +++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c @@ -1260,6 +1260,17 @@ void mod_freesync_handle_v_update(struct mod_freesync *mod_freesync, update_v_total_for_static_ramp( core_freesync, stream, in_out_vrr); } + + /* + * If VRR is inactive, set vtotal min and max to nominal vtotal + */ + if (in_out_vrr->state == VRR_STATE_INACTIVE) { + in_out_vrr->adjust.v_total_min = + mod_freesync_calc_v_total_from_refresh(stream, + in_out_vrr->max_refresh_in_uhz); + in_out_vrr->adjust.v_total_max = in_out_vrr->adjust.v_total_min; + return; + } } unsigned long long mod_freesync_calc_nominal_field_rate( diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c index 518d07afc7df..bc29a923fa6e 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c @@ -195,24 +195,6 @@ int amdgpu_dpm_set_mp1_state(struct amdgpu_device *adev, return ret; } -int amdgpu_dpm_notify_rlc_state(struct amdgpu_device *adev, bool en) -{ - int ret = 0; - const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; - - if (pp_funcs && pp_funcs->notify_rlc_state) { - mutex_lock(&adev->pm.mutex); - - ret = pp_funcs->notify_rlc_state( - adev->powerplay.pp_handle, - en); - - mutex_unlock(&adev->pm.mutex); - } - - return ret; -} - int amdgpu_dpm_is_baco_supported(struct amdgpu_device *adev) { const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index b5fbb0fd1dc0..a7e6d7854b7b 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -4724,14 +4724,14 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev) ret = devm_device_add_group(adev->dev, &amdgpu_pm_policy_attr_group); if (ret) - goto err_out0; + goto err_out1; } if (amdgpu_dpm_is_temp_metrics_supported(adev, SMU_TEMP_METRIC_GPUBOARD)) { ret = devm_device_add_group(adev->dev, &amdgpu_board_attr_group); if (ret) - goto err_out0; + goto err_out1; if (amdgpu_pm_get_sensor_generic(adev, AMDGPU_PP_SENSOR_MAXNODEPOWERLIMIT, (void *)&tmp) != -EOPNOTSUPP) { sysfs_add_file_to_group(&adev->dev->kobj, diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h index 65c1d98af26c..af48aead12f7 100644 --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h @@ -424,8 +424,6 @@ int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev); int amdgpu_dpm_set_mp1_state(struct amdgpu_device *adev, enum pp_mp1_state mp1_state); -int amdgpu_dpm_notify_rlc_state(struct amdgpu_device *adev, bool en); - int amdgpu_dpm_set_gfx_power_up_by_imu(struct amdgpu_device *adev); int amdgpu_dpm_baco_exit(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index fb8086859857..244b8c364d45 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -2040,6 +2040,12 @@ static int smu_disable_dpms(struct smu_context *smu) smu->is_apu && (amdgpu_in_reset(adev) || adev->in_s0ix)) return 0; + /* vangogh s0ix */ + if ((amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(11, 5, 0) || + amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(11, 5, 2)) && + adev->in_s0ix) + return 0; + /* * For gpu reset, runpm and hibernation through BACO, * BACO feature has to be kept enabled. diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c index 2c9869feba61..0708d0f0938b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c @@ -2217,6 +2217,9 @@ static int vangogh_post_smu_init(struct smu_context *smu) uint32_t total_cu = adev->gfx.config.max_cu_per_sh * adev->gfx.config.max_sh_per_se * adev->gfx.config.max_shader_engines; + if (adev->in_s0ix) + return 0; + /* allow message will be sent after enable message on Vangogh*/ if (smu_cmn_feature_is_enabled(smu, SMU_FEATURE_DPM_GFXCLK_BIT) && (adev->pg_flags & AMD_PG_SUPPORT_GFX_PG)) { diff --git a/drivers/gpu/drm/bridge/sii902x.c b/drivers/gpu/drm/bridge/sii902x.c index d537b1d036fb..1f0aba28ad1e 100644 --- a/drivers/gpu/drm/bridge/sii902x.c +++ b/drivers/gpu/drm/bridge/sii902x.c @@ -179,7 +179,6 @@ struct sii902x { struct drm_connector connector; struct gpio_desc *reset_gpio; struct i2c_mux_core *i2cmux; - bool sink_is_hdmi; u32 bus_width; /* @@ -315,8 +314,6 @@ static int sii902x_get_modes(struct drm_connector *connector) drm_edid_free(drm_edid); } - sii902x->sink_is_hdmi = connector->display_info.is_hdmi; - return num; } @@ -342,9 +339,17 @@ static void sii902x_bridge_atomic_enable(struct drm_bridge *bridge, struct drm_atomic_state *state) { struct sii902x *sii902x = bridge_to_sii902x(bridge); + struct drm_connector *connector; + u8 output_mode = SII902X_SYS_CTRL_OUTPUT_DVI; + + connector = drm_atomic_get_new_connector_for_encoder(state, bridge->encoder); + if (connector && connector->display_info.is_hdmi) + output_mode = SII902X_SYS_CTRL_OUTPUT_HDMI; mutex_lock(&sii902x->mutex); + regmap_update_bits(sii902x->regmap, SII902X_SYS_CTRL_DATA, + SII902X_SYS_CTRL_OUTPUT_MODE, output_mode); regmap_update_bits(sii902x->regmap, SII902X_PWR_STATE_CTRL, SII902X_AVI_POWER_STATE_MSK, SII902X_AVI_POWER_STATE_D(0)); @@ -359,16 +364,12 @@ static void sii902x_bridge_mode_set(struct drm_bridge *bridge, const struct drm_display_mode *adj) { struct sii902x *sii902x = bridge_to_sii902x(bridge); - u8 output_mode = SII902X_SYS_CTRL_OUTPUT_DVI; struct regmap *regmap = sii902x->regmap; u8 buf[HDMI_INFOFRAME_SIZE(AVI)]; struct hdmi_avi_infoframe frame; u16 pixel_clock_10kHz = adj->clock / 10; int ret; - if (sii902x->sink_is_hdmi) - output_mode = SII902X_SYS_CTRL_OUTPUT_HDMI; - buf[0] = pixel_clock_10kHz & 0xff; buf[1] = pixel_clock_10kHz >> 8; buf[2] = drm_mode_vrefresh(adj); @@ -384,11 +385,6 @@ static void sii902x_bridge_mode_set(struct drm_bridge *bridge, mutex_lock(&sii902x->mutex); - ret = regmap_update_bits(sii902x->regmap, SII902X_SYS_CTRL_DATA, - SII902X_SYS_CTRL_OUTPUT_MODE, output_mode); - if (ret) - goto out; - ret = regmap_bulk_write(regmap, SII902X_TPI_VIDEO_DATA, buf, 10); if (ret) goto out; diff --git a/drivers/gpu/drm/clients/drm_client_setup.c b/drivers/gpu/drm/clients/drm_client_setup.c index 72480db1f00d..515aceac22b1 100644 --- a/drivers/gpu/drm/clients/drm_client_setup.c +++ b/drivers/gpu/drm/clients/drm_client_setup.c @@ -13,8 +13,8 @@ static char drm_client_default[16] = CONFIG_DRM_CLIENT_DEFAULT; module_param_string(active, drm_client_default, sizeof(drm_client_default), 0444); MODULE_PARM_DESC(active, - "Choose which drm client to start, default is" - CONFIG_DRM_CLIENT_DEFAULT "]"); + "Choose which drm client to start, default is " + CONFIG_DRM_CLIENT_DEFAULT); /** * drm_client_setup() - Setup in-kernel DRM clients diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 11a5b60cb9ce..0b3ee008523d 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -31,9 +31,7 @@ #include <linux/console.h> #include <linux/export.h> -#include <linux/pci.h> #include <linux/sysrq.h> -#include <linux/vga_switcheroo.h> #include <drm/drm_atomic.h> #include <drm/drm_drv.h> @@ -566,11 +564,6 @@ EXPORT_SYMBOL(drm_fb_helper_release_info); */ void drm_fb_helper_unregister_info(struct drm_fb_helper *fb_helper) { - struct fb_info *info = fb_helper->info; - struct device *dev = info->device; - - if (dev_is_pci(dev)) - vga_switcheroo_client_fb_set(to_pci_dev(dev), NULL); unregister_framebuffer(fb_helper->info); } EXPORT_SYMBOL(drm_fb_helper_unregister_info); @@ -1632,7 +1625,6 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper) struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; struct drm_fb_helper_surface_size sizes; - struct fb_info *info; int ret; if (drm_WARN_ON(dev, !dev->driver->fbdev_probe)) @@ -1653,12 +1645,6 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper) strcpy(fb_helper->fb->comm, "[fbcon]"); - info = fb_helper->info; - - /* Set the fb info for vgaswitcheroo clients. Does nothing otherwise. */ - if (dev_is_pci(info->device)) - vga_switcheroo_client_fb_set(to_pci_dev(info->device), info); - return 0; } diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c index 38f82391bfda..a30493ed9715 100644 --- a/drivers/gpu/drm/drm_plane.c +++ b/drivers/gpu/drm/drm_plane.c @@ -210,7 +210,7 @@ static struct drm_property_blob *create_in_format_blob(struct drm_device *dev, formats_size = sizeof(__u32) * plane->format_count; if (WARN_ON(!formats_size)) { /* 0 formats are never expected */ - return 0; + return ERR_PTR(-EINVAL); } modifiers_size = @@ -226,7 +226,7 @@ static struct drm_property_blob *create_in_format_blob(struct drm_device *dev, blob = drm_property_create_blob(dev, blob_size, NULL); if (IS_ERR(blob)) - return NULL; + return blob; blob_data = blob->data; blob_data->version = FORMAT_BLOB_CURRENT; diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index e58c0c158b3a..b91575380708 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -413,7 +413,7 @@ obj-$(CONFIG_DRM_I915_GVT_KVMGT) += kvmgt.o # # Enable locally for CONFIG_DRM_I915_WERROR=y. See also scripts/Makefile.build ifdef CONFIG_DRM_I915_WERROR - cmd_checkdoc = PYTHONDONTWRITEBYTECODE=1 $(KERNELDOC) -none -Werror $< + cmd_checkdoc = PYTHONDONTWRITEBYTECODE=1 $(PYTHON3) $(KERNELDOC) -none -Werror $< endif # header test diff --git a/drivers/gpu/drm/i915/display/intel_cx0_phy.c b/drivers/gpu/drm/i915/display/intel_cx0_phy.c index 801235a5bc0a..a2d2cecf7121 100644 --- a/drivers/gpu/drm/i915/display/intel_cx0_phy.c +++ b/drivers/gpu/drm/i915/display/intel_cx0_phy.c @@ -39,14 +39,12 @@ bool intel_encoder_is_c10phy(struct intel_encoder *encoder) struct intel_display *display = to_intel_display(encoder); enum phy phy = intel_encoder_to_phy(encoder); - /* PTL doesn't have a PHY connected to PORT B; as such, - * there will never be a case where PTL uses PHY B. - * WCL uses PORT A and B with the C10 PHY. - * Reusing the condition for WCL and extending it for PORT B - * should not cause any issues for PTL. - */ - if (display->platform.pantherlake && phy < PHY_C) - return true; + if (display->platform.pantherlake) { + if (display->platform.pantherlake_wildcatlake) + return phy <= PHY_B; + else + return phy == PHY_A; + } if ((display->platform.lunarlake || display->platform.meteorlake) && phy < PHY_C) return true; diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 5dca7f96b425..0d527cf22866 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -5964,6 +5964,14 @@ static int intel_async_flip_check_uapi(struct intel_atomic_state *state, return -EINVAL; } + /* FIXME: selective fetch should be disabled for async flips */ + if (new_crtc_state->enable_psr2_sel_fetch) { + drm_dbg_kms(display->drm, + "[CRTC:%d:%s] async flip disallowed with PSR2 selective fetch\n", + crtc->base.base.id, crtc->base.name); + return -EINVAL; + } + for_each_oldnew_intel_plane_in_state(state, plane, old_plane_state, new_plane_state, i) { if (plane->pipe != crtc->pipe) diff --git a/drivers/gpu/drm/i915/display/intel_display_device.c b/drivers/gpu/drm/i915/display/intel_display_device.c index a002bc6ce7b0..f3f1f25b0f38 100644 --- a/drivers/gpu/drm/i915/display/intel_display_device.c +++ b/drivers/gpu/drm/i915/display/intel_display_device.c @@ -1404,8 +1404,20 @@ static const struct platform_desc bmg_desc = { PLATFORM_GROUP(dgfx), }; +static const u16 wcl_ids[] = { + INTEL_WCL_IDS(ID), + 0 +}; + static const struct platform_desc ptl_desc = { PLATFORM(pantherlake), + .subplatforms = (const struct subplatform_desc[]) { + { + SUBPLATFORM(pantherlake, wildcatlake), + .pciidlist = wcl_ids, + }, + {}, + } }; __diag_pop(); @@ -1482,6 +1494,7 @@ static const struct { INTEL_LNL_IDS(INTEL_DISPLAY_DEVICE, &lnl_desc), INTEL_BMG_IDS(INTEL_DISPLAY_DEVICE, &bmg_desc), INTEL_PTL_IDS(INTEL_DISPLAY_DEVICE, &ptl_desc), + INTEL_WCL_IDS(INTEL_DISPLAY_DEVICE, &ptl_desc), }; static const struct { diff --git a/drivers/gpu/drm/i915/display/intel_display_device.h b/drivers/gpu/drm/i915/display/intel_display_device.h index f329f1beafef..a910642d589c 100644 --- a/drivers/gpu/drm/i915/display/intel_display_device.h +++ b/drivers/gpu/drm/i915/display/intel_display_device.h @@ -101,7 +101,9 @@ struct pci_dev; /* Display ver 14.1 (based on GMD ID) */ \ func(battlemage) \ /* Display ver 30 (based on GMD ID) */ \ - func(pantherlake) + func(pantherlake) \ + func(pantherlake_wildcatlake) + #define __MEMBER(name) unsigned long name:1; #define __COUNT(x) 1 + diff --git a/drivers/gpu/drm/i915/display/intel_dmc.c b/drivers/gpu/drm/i915/display/intel_dmc.c index 4a4cace1f879..e1455fd7277f 100644 --- a/drivers/gpu/drm/i915/display/intel_dmc.c +++ b/drivers/gpu/drm/i915/display/intel_dmc.c @@ -127,6 +127,9 @@ static bool dmc_firmware_param_disabled(struct intel_display *display) #define DISPLAY_VER13_DMC_MAX_FW_SIZE 0x20000 #define DISPLAY_VER12_DMC_MAX_FW_SIZE ICL_DMC_MAX_FW_SIZE +#define XE3LPD_3002_DMC_PATH DMC_PATH(xe3lpd_3002) +MODULE_FIRMWARE(XE3LPD_3002_DMC_PATH); + #define XE3LPD_DMC_PATH DMC_PATH(xe3lpd) MODULE_FIRMWARE(XE3LPD_DMC_PATH); @@ -183,9 +186,10 @@ static const char *dmc_firmware_default(struct intel_display *display, u32 *size { const char *fw_path = NULL; u32 max_fw_size = 0; - - if (DISPLAY_VERx100(display) == 3002 || - DISPLAY_VERx100(display) == 3000) { + if (DISPLAY_VERx100(display) == 3002) { + fw_path = XE3LPD_3002_DMC_PATH; + max_fw_size = XE2LPD_DMC_MAX_FW_SIZE; + } else if (DISPLAY_VERx100(display) == 3000) { fw_path = XE3LPD_DMC_PATH; max_fw_size = XE2LPD_DMC_MAX_FW_SIZE; } else if (DISPLAY_VERx100(display) == 2000) { diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index 10eb93a34cf2..6d9c95e5c025 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -585,6 +585,10 @@ static void _panel_replay_init_dpcd(struct intel_dp *intel_dp) struct intel_display *display = to_intel_display(intel_dp); int ret; + /* TODO: Enable Panel Replay on MST once it's properly implemented. */ + if (intel_dp->mst_detect == DRM_DP_MST) + return; + ret = drm_dp_dpcd_read_data(&intel_dp->aux, DP_PANEL_REPLAY_CAP_SUPPORT, &intel_dp->pr_dpcd, sizeof(intel_dp->pr_dpcd)); if (ret < 0) @@ -888,7 +892,8 @@ static bool is_dc5_dc6_blocked(struct intel_dp *intel_dp) { struct intel_display *display = to_intel_display(intel_dp); u32 current_dc_state = intel_display_power_get_current_dc_state(display); - struct drm_vblank_crtc *vblank = &display->drm->vblank[intel_dp->psr.pipe]; + struct intel_crtc *crtc = intel_crtc_for_pipe(display, intel_dp->psr.pipe); + struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(&crtc->base); return (current_dc_state != DC_STATE_EN_UPTO_DC5 && current_dc_state != DC_STATE_EN_UPTO_DC6) || @@ -1251,12 +1256,6 @@ static bool intel_psr2_sel_fetch_config_valid(struct intel_dp *intel_dp, return false; } - if (crtc_state->uapi.async_flip) { - drm_dbg_kms(display->drm, - "PSR2 sel fetch not enabled, async flip enabled\n"); - return false; - } - return crtc_state->enable_psr2_sel_fetch = true; } diff --git a/drivers/gpu/drm/i915/gt/intel_gt_clock_utils.c b/drivers/gpu/drm/i915/gt/intel_gt_clock_utils.c index 88b147fa5cb1..c90b35881a26 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_clock_utils.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_clock_utils.c @@ -205,7 +205,7 @@ static u64 div_u64_roundup(u64 nom, u32 den) u64 intel_gt_clock_interval_to_ns(const struct intel_gt *gt, u64 count) { - return div_u64_roundup(count * NSEC_PER_SEC, gt->clock_frequency); + return mul_u64_u32_div(count, NSEC_PER_SEC, gt->clock_frequency); } u64 intel_gt_pm_interval_to_ns(const struct intel_gt *gt, u64 count) @@ -215,7 +215,7 @@ u64 intel_gt_pm_interval_to_ns(const struct intel_gt *gt, u64 count) u64 intel_gt_ns_to_clock_interval(const struct intel_gt *gt, u64 ns) { - return div_u64_roundup(gt->clock_frequency * ns, NSEC_PER_SEC); + return mul_u64_u32_div(ns, gt->clock_frequency, NSEC_PER_SEC); } u64 intel_gt_ns_to_pm_interval(const struct intel_gt *gt, u64 ns) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 25e97031d76e..30d5889fc809 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -1595,8 +1595,20 @@ err_unlock: err_vma_res: i915_vma_resource_free(vma_res); err_fence: - if (work) - dma_fence_work_commit_imm(&work->base); + if (work) { + /* + * When pinning VMA to GGTT on CHV or BXT with VTD enabled, + * commit VMA binding asynchronously to avoid risk of lock + * inversion among reservation_ww locks held here and + * cpu_hotplug_lock acquired from stop_machine(), which we + * wrap around GGTT updates when running in those environments. + */ + if (i915_vma_is_ggtt(vma) && + intel_vm_no_concurrent_access_wa(vma->vm->i915)) + dma_fence_work_commit(&work->base); + else + dma_fence_work_commit_imm(&work->base); + } err_rpm: intel_runtime_pm_put(&vma->vm->i915->runtime_pm, wakeref); diff --git a/drivers/gpu/drm/imagination/Kconfig b/drivers/gpu/drm/imagination/Kconfig index 682dd2633d0c..0482bfcefdde 100644 --- a/drivers/gpu/drm/imagination/Kconfig +++ b/drivers/gpu/drm/imagination/Kconfig @@ -7,6 +7,7 @@ config DRM_POWERVR depends on DRM depends on MMU depends on PM + depends on POWER_SEQUENCING || !POWER_SEQUENCING select DRM_EXEC select DRM_GEM_SHMEM_HELPER select DRM_SCHED diff --git a/drivers/gpu/drm/imagination/pvr_device.h b/drivers/gpu/drm/imagination/pvr_device.h index ab8f56ae15df..ec53ff275541 100644 --- a/drivers/gpu/drm/imagination/pvr_device.h +++ b/drivers/gpu/drm/imagination/pvr_device.h @@ -146,6 +146,14 @@ struct pvr_device { */ struct clk *mem_clk; + /** + * @power: Optional power domain devices. + * + * On platforms with more than one power domain for the GPU, they are + * stored here in @domain_devs, along with links between them in + * @domain_links. The size of @domain_devs is given by @domain_count, + * while the size of @domain_links is (2 * @domain_count) - 1. + */ struct pvr_device_power { struct device **domain_devs; struct device_link **domain_links; diff --git a/drivers/gpu/drm/mediatek/mtk_crtc.c b/drivers/gpu/drm/mediatek/mtk_crtc.c index bc7527542fdc..c4c6d0249df5 100644 --- a/drivers/gpu/drm/mediatek/mtk_crtc.c +++ b/drivers/gpu/drm/mediatek/mtk_crtc.c @@ -283,6 +283,10 @@ static void ddp_cmdq_cb(struct mbox_client *cl, void *mssg) unsigned int i; unsigned long flags; + /* release GCE HW usage and start autosuspend */ + pm_runtime_mark_last_busy(cmdq_cl->chan->mbox->dev); + pm_runtime_put_autosuspend(cmdq_cl->chan->mbox->dev); + if (data->sta < 0) return; @@ -618,6 +622,9 @@ static void mtk_crtc_update_config(struct mtk_crtc *mtk_crtc, bool needs_vblank) mtk_crtc->config_updating = false; spin_unlock_irqrestore(&mtk_crtc->config_lock, flags); + if (pm_runtime_resume_and_get(mtk_crtc->cmdq_client.chan->mbox->dev) < 0) + goto update_config_out; + mbox_send_message(mtk_crtc->cmdq_client.chan, cmdq_handle); mbox_client_txdone(mtk_crtc->cmdq_client.chan, 0); goto update_config_out; diff --git a/drivers/gpu/drm/mediatek/mtk_plane.c b/drivers/gpu/drm/mediatek/mtk_plane.c index 02349bd44001..788b52c1d10c 100644 --- a/drivers/gpu/drm/mediatek/mtk_plane.c +++ b/drivers/gpu/drm/mediatek/mtk_plane.c @@ -21,9 +21,6 @@ static const u64 modifiers[] = { DRM_FORMAT_MOD_LINEAR, - DRM_FORMAT_MOD_ARM_AFBC(AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 | - AFBC_FORMAT_MOD_SPLIT | - AFBC_FORMAT_MOD_SPARSE), DRM_FORMAT_MOD_INVALID, }; @@ -71,26 +68,7 @@ static bool mtk_plane_format_mod_supported(struct drm_plane *plane, uint32_t format, uint64_t modifier) { - if (modifier == DRM_FORMAT_MOD_LINEAR) - return true; - - if (modifier != DRM_FORMAT_MOD_ARM_AFBC( - AFBC_FORMAT_MOD_BLOCK_SIZE_32x8 | - AFBC_FORMAT_MOD_SPLIT | - AFBC_FORMAT_MOD_SPARSE)) - return false; - - if (format != DRM_FORMAT_XRGB8888 && - format != DRM_FORMAT_ARGB8888 && - format != DRM_FORMAT_BGRX8888 && - format != DRM_FORMAT_BGRA8888 && - format != DRM_FORMAT_ABGR8888 && - format != DRM_FORMAT_XBGR8888 && - format != DRM_FORMAT_RGB888 && - format != DRM_FORMAT_BGR888) - return false; - - return true; + return modifier == DRM_FORMAT_MOD_LINEAR; } static void mtk_plane_destroy_state(struct drm_plane *plane, diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c index e97e39abf3a2..12b1dba8e05d 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.c +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c @@ -2867,7 +2867,9 @@ nv50_display_create(struct drm_device *dev) } /* Assign the correct format modifiers */ - if (disp->disp->object.oclass >= TU102_DISP) + if (disp->disp->object.oclass >= GB202_DISP) + nouveau_display(dev)->format_modifiers = wndwca7e_modifiers; + else if (disp->disp->object.oclass >= TU102_DISP) nouveau_display(dev)->format_modifiers = wndwc57e_modifiers; else if (drm->client.device.info.family >= NV_DEVICE_INFO_V0_FERMI) diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.h b/drivers/gpu/drm/nouveau/dispnv50/disp.h index 15f9242b72ac..5d998f0319dc 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.h +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.h @@ -104,4 +104,5 @@ struct nouveau_encoder *nv50_real_outp(struct drm_encoder *encoder); extern const u64 disp50xx_modifiers[]; extern const u64 disp90xx_modifiers[]; extern const u64 wndwc57e_modifiers[]; +extern const u64 wndwca7e_modifiers[]; #endif diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c index e2c55f4b9c5a..ef9e410babbf 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c @@ -786,13 +786,14 @@ nv50_wndw_destroy(struct drm_plane *plane) } /* This function assumes the format has already been validated against the plane - * and the modifier was validated against the device-wides modifier list at FB + * and the modifier was validated against the device-wide modifier list at FB * creation time. */ static bool nv50_plane_format_mod_supported(struct drm_plane *plane, u32 format, u64 modifier) { struct nouveau_drm *drm = nouveau_drm(plane->dev); + const struct drm_format_info *info = drm_format_info(format); uint8_t i; /* All chipsets can display all formats in linear layout */ @@ -800,13 +801,32 @@ static bool nv50_plane_format_mod_supported(struct drm_plane *plane, return true; if (drm->client.device.info.chipset < 0xc0) { - const struct drm_format_info *info = drm_format_info(format); const uint8_t kind = (modifier >> 12) & 0xff; if (!format) return false; for (i = 0; i < info->num_planes; i++) if ((info->cpp[i] != 4) && kind != 0x70) return false; + } else if (drm->client.device.info.chipset >= 0x1b2) { + const uint8_t slayout = ((modifier >> 22) & 0x1) | + ((modifier >> 25) & 0x6); + + if (!format) + return false; + + /* + * Note in practice this implies only formats where cpp is equal + * for each plane, or >= 4 for all planes, are supported. + */ + for (i = 0; i < info->num_planes; i++) { + if (((info->cpp[i] == 2) && slayout != 3) || + ((info->cpp[i] == 1) && slayout != 2) || + ((info->cpp[i] >= 4) && slayout != 1)) + return false; + + /* 24-bit not supported. It has yet another layout */ + WARN_ON(info->cpp[i] == 3); + } } return true; diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndwca7e.c b/drivers/gpu/drm/nouveau/dispnv50/wndwca7e.c index 0d8e9a9d1a57..2cec8cfbd546 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndwca7e.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wndwca7e.c @@ -179,6 +179,39 @@ wndwca7e_ntfy_set(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw) return 0; } +/**************************************************************** + * Log2(block height) ----------------------------+ * + * Page Kind ----------------------------------+ | * + * Gob Height/Page Kind Generation ------+ | | * + * Sector layout -------+ | | | * + * Compression ------+ | | | | */ +const u64 wndwca7e_modifiers[] = { /* | | | | | */ + /* 4cpp+ modifiers */ + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 1, 2, 0x06, 0), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 1, 2, 0x06, 1), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 1, 2, 0x06, 2), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 1, 2, 0x06, 3), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 1, 2, 0x06, 4), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 1, 2, 0x06, 5), + /* 1cpp/8bpp modifiers */ + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 2, 2, 0x06, 0), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 2, 2, 0x06, 1), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 2, 2, 0x06, 2), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 2, 2, 0x06, 3), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 2, 2, 0x06, 4), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 2, 2, 0x06, 5), + /* 2cpp/16bpp modifiers */ + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 3, 2, 0x06, 0), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 3, 2, 0x06, 1), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 3, 2, 0x06, 2), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 3, 2, 0x06, 3), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 3, 2, 0x06, 4), + DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(0, 3, 2, 0x06, 5), + /* All formats support linear */ + DRM_FORMAT_MOD_LINEAR, + DRM_FORMAT_MOD_INVALID +}; + static const struct nv50_wndw_func wndwca7e = { .acquire = wndwc37e_acquire, diff --git a/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c b/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c index cac6d64ab67d..4e8b3f1c7e25 100644 --- a/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c +++ b/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c @@ -159,6 +159,8 @@ nvkm_falcon_fw_dtor(struct nvkm_falcon_fw *fw) nvkm_memory_unref(&fw->inst); nvkm_falcon_fw_dtor_sigs(fw); nvkm_firmware_dtor(&fw->fw); + kfree(fw->boot); + fw->boot = NULL; } static const struct nvkm_firmware_func diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c index 156c7a0b62a2..3f43686f0195 100644 --- a/drivers/gpu/drm/panthor/panthor_gem.c +++ b/drivers/gpu/drm/panthor/panthor_gem.c @@ -288,6 +288,23 @@ panthor_gem_create_with_handle(struct drm_file *file, panthor_gem_debugfs_set_usage_flags(bo, 0); + /* If this is a write-combine mapping, we query the sgt to force a CPU + * cache flush (dma_map_sgtable() is called when the sgt is created). + * This ensures the zero-ing is visible to any uncached mapping created + * by vmap/mmap. + * FIXME: Ideally this should be done when pages are allocated, not at + * BO creation time. + */ + if (shmem->map_wc) { + struct sg_table *sgt; + + sgt = drm_gem_shmem_get_pages_sgt(shmem); + if (IS_ERR(sgt)) { + ret = PTR_ERR(sgt); + goto out_put_gem; + } + } + /* * Allocate an id of idr table where the obj is registered * and handle has the id what user can see. @@ -296,6 +313,7 @@ panthor_gem_create_with_handle(struct drm_file *file, if (!ret) *size = bo->base.base.size; +out_put_gem: /* drop reference from allocate - handle holds it now. */ drm_gem_object_put(&shmem->base); diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 5b5b54e876d4..167d6f122b8e 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -360,13 +360,6 @@ static bool radeon_fence_is_signaled(struct dma_fence *f) if (atomic64_read(&rdev->fence_drv[ring].last_seq) >= seq) return true; - if (down_read_trylock(&rdev->exclusive_lock)) { - radeon_fence_process(rdev, ring); - up_read(&rdev->exclusive_lock); - - if (atomic64_read(&rdev->fence_drv[ring].last_seq) >= seq) - return true; - } return false; } diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index c8e949f4a568..fe174a4857be 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -173,26 +173,15 @@ int drm_sched_entity_error(struct drm_sched_entity *entity) } EXPORT_SYMBOL(drm_sched_entity_error); +static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, + struct dma_fence_cb *cb); + static void drm_sched_entity_kill_jobs_work(struct work_struct *wrk) { struct drm_sched_job *job = container_of(wrk, typeof(*job), work); - - drm_sched_fence_scheduled(job->s_fence, NULL); - drm_sched_fence_finished(job->s_fence, -ESRCH); - WARN_ON(job->s_fence->parent); - job->sched->ops->free_job(job); -} - -/* Signal the scheduler finished fence when the entity in question is killed. */ -static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, - struct dma_fence_cb *cb) -{ - struct drm_sched_job *job = container_of(cb, struct drm_sched_job, - finish_cb); + struct dma_fence *f; unsigned long index; - dma_fence_put(f); - /* Wait for all dependencies to avoid data corruptions */ xa_for_each(&job->dependencies, index, f) { struct drm_sched_fence *s_fence = to_drm_sched_fence(f); @@ -220,6 +209,21 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, dma_fence_put(f); } + drm_sched_fence_scheduled(job->s_fence, NULL); + drm_sched_fence_finished(job->s_fence, -ESRCH); + WARN_ON(job->s_fence->parent); + job->sched->ops->free_job(job); +} + +/* Signal the scheduler finished fence when the entity in question is killed. */ +static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, + struct dma_fence_cb *cb) +{ + struct drm_sched_job *job = container_of(cb, struct drm_sched_job, + finish_cb); + + dma_fence_put(f); + INIT_WORK(&job->work, drm_sched_entity_kill_jobs_work); schedule_work(&job->work); } diff --git a/drivers/gpu/drm/sti/sti_vtg.c b/drivers/gpu/drm/sti/sti_vtg.c index ee81691b3203..ce6bc7e7b135 100644 --- a/drivers/gpu/drm/sti/sti_vtg.c +++ b/drivers/gpu/drm/sti/sti_vtg.c @@ -143,12 +143,17 @@ struct sti_vtg { struct sti_vtg *of_vtg_find(struct device_node *np) { struct platform_device *pdev; + struct sti_vtg *vtg; pdev = of_find_device_by_node(np); if (!pdev) return NULL; - return (struct sti_vtg *)platform_get_drvdata(pdev); + vtg = platform_get_drvdata(pdev); + + put_device(&pdev->dev); + + return vtg; } static void vtg_reset(struct sti_vtg *vtg) diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 59d5c1ba145a..6c84bd69b11f 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -3148,6 +3148,7 @@ static int tegra_dc_couple(struct tegra_dc *dc) dc->client.parent = &parent->client; dev_dbg(dc->dev, "coupled to %s\n", dev_name(companion)); + put_device(companion); } return 0; diff --git a/drivers/gpu/drm/tegra/dsi.c b/drivers/gpu/drm/tegra/dsi.c index b5089b772267..ddfb2858acbf 100644 --- a/drivers/gpu/drm/tegra/dsi.c +++ b/drivers/gpu/drm/tegra/dsi.c @@ -913,15 +913,6 @@ static void tegra_dsi_encoder_enable(struct drm_encoder *encoder) u32 value; int err; - /* If the bootloader enabled DSI it needs to be disabled - * in order for the panel initialization commands to be - * properly sent. - */ - value = tegra_dsi_readl(dsi, DSI_POWER_CONTROL); - - if (value & DSI_POWER_CONTROL_ENABLE) - tegra_dsi_disable(dsi); - err = tegra_dsi_prepare(dsi); if (err < 0) { dev_err(dsi->dev, "failed to prepare: %d\n", err); diff --git a/drivers/gpu/drm/tegra/uapi.c b/drivers/gpu/drm/tegra/uapi.c index 5adab6b22916..d0b6a1fa6efa 100644 --- a/drivers/gpu/drm/tegra/uapi.c +++ b/drivers/gpu/drm/tegra/uapi.c @@ -114,9 +114,12 @@ int tegra_drm_ioctl_channel_open(struct drm_device *drm, void *data, struct drm_ if (err) goto put_channel; - if (supported) + if (supported) { + struct pid *pid = get_task_pid(current, PIDTYPE_TGID); context->memory_context = host1x_memory_context_alloc( - host, client->base.dev, get_task_pid(current, PIDTYPE_TGID)); + host, client->base.dev, pid); + put_pid(pid); + } if (IS_ERR(context->memory_context)) { if (PTR_ERR(context->memory_context) != -EOPNOTSUPP) { diff --git a/drivers/gpu/drm/tiny/Kconfig b/drivers/gpu/drm/tiny/Kconfig index 7d9e85e932d7..f0e72d4b6a47 100644 --- a/drivers/gpu/drm/tiny/Kconfig +++ b/drivers/gpu/drm/tiny/Kconfig @@ -85,6 +85,7 @@ config DRM_PANEL_MIPI_DBI config DRM_PIXPAPER tristate "DRM support for PIXPAPER display panels" depends on DRM && SPI + depends on MMU select DRM_CLIENT_SELECTION select DRM_GEM_SHMEM_HELPER select DRM_KMS_HELPER diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c index 718832b08d96..c46f17ba7236 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c @@ -100,8 +100,10 @@ vmw_cursor_update_type(struct vmw_private *vmw, struct vmw_plane_state *vps) if (vmw->has_mob) { if ((vmw->capabilities2 & SVGA_CAP2_CURSOR_MOB) != 0) return VMW_CURSOR_UPDATE_MOB; + else + return VMW_CURSOR_UPDATE_GB_ONLY; } - + drm_warn_once(&vmw->drm, "Unknown Cursor Type!\n"); return VMW_CURSOR_UPDATE_NONE; } @@ -139,6 +141,7 @@ static u32 vmw_cursor_mob_size(enum vmw_cursor_update_type update_type, { switch (update_type) { case VMW_CURSOR_UPDATE_LEGACY: + case VMW_CURSOR_UPDATE_GB_ONLY: case VMW_CURSOR_UPDATE_NONE: return 0; case VMW_CURSOR_UPDATE_MOB: @@ -623,6 +626,7 @@ int vmw_cursor_plane_prepare_fb(struct drm_plane *plane, if (!surface || vps->cursor.legacy.id == surface->snooper.id) vps->cursor.update_type = VMW_CURSOR_UPDATE_NONE; break; + case VMW_CURSOR_UPDATE_GB_ONLY: case VMW_CURSOR_UPDATE_MOB: { bo = vmw_user_object_buffer(&vps->uo); if (bo) { @@ -737,6 +741,7 @@ void vmw_cursor_plane_atomic_update(struct drm_plane *plane, struct drm_atomic_state *state) { + struct vmw_bo *bo; struct drm_plane_state *new_state = drm_atomic_get_new_plane_state(state, plane); struct drm_plane_state *old_state = @@ -762,6 +767,15 @@ vmw_cursor_plane_atomic_update(struct drm_plane *plane, case VMW_CURSOR_UPDATE_MOB: vmw_cursor_update_mob(dev_priv, vps); break; + case VMW_CURSOR_UPDATE_GB_ONLY: + bo = vmw_user_object_buffer(&vps->uo); + if (bo) + vmw_send_define_cursor_cmd(dev_priv, bo->map.virtual, + vps->base.crtc_w, + vps->base.crtc_h, + vps->base.hotspot_x, + vps->base.hotspot_y); + break; case VMW_CURSOR_UPDATE_NONE: /* do nothing */ break; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h index 40694925a70e..0c2cc0699b0d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h @@ -33,6 +33,7 @@ static const u32 __maybe_unused vmw_cursor_plane_formats[] = { enum vmw_cursor_update_type { VMW_CURSOR_UPDATE_NONE = 0, VMW_CURSOR_UPDATE_LEGACY, + VMW_CURSOR_UPDATE_GB_ONLY, VMW_CURSOR_UPDATE_MOB, }; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index d539f25b5fbe..3057f8baa7d2 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -3668,6 +3668,11 @@ static int vmw_cmd_check(struct vmw_private *dev_priv, cmd_id = header->id; + if (header->size > SVGA_CMD_MAX_DATASIZE) { + VMW_DEBUG_USER("SVGA3D command: %d is too big.\n", + cmd_id + SVGA_3D_CMD_BASE); + return -E2BIG; + } *size = header->size + sizeof(SVGA3dCmdHeader); cmd_id -= SVGA_3D_CMD_BASE; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c index 7de20e56082c..fd4e76486f2d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c @@ -32,22 +32,22 @@ enum vmw_bo_dirty_method { /** * struct vmw_bo_dirty - Dirty information for buffer objects + * @ref_count: Reference count for this structure. Must be first member! * @start: First currently dirty bit * @end: Last currently dirty bit + 1 * @method: The currently used dirty method * @change_count: Number of consecutive method change triggers - * @ref_count: Reference count for this structure * @bitmap_size: The size of the bitmap in bits. Typically equal to the * nuber of pages in the bo. * @bitmap: A bitmap where each bit represents a page. A set bit means a * dirty page. */ struct vmw_bo_dirty { + struct kref ref_count; unsigned long start; unsigned long end; enum vmw_bo_dirty_method method; unsigned int change_count; - unsigned int ref_count; unsigned long bitmap_size; unsigned long bitmap[]; }; @@ -221,7 +221,7 @@ int vmw_bo_dirty_add(struct vmw_bo *vbo) int ret; if (dirty) { - dirty->ref_count++; + kref_get(&dirty->ref_count); return 0; } @@ -235,7 +235,7 @@ int vmw_bo_dirty_add(struct vmw_bo *vbo) dirty->bitmap_size = num_pages; dirty->start = dirty->bitmap_size; dirty->end = 0; - dirty->ref_count = 1; + kref_init(&dirty->ref_count); if (num_pages < PAGE_SIZE / sizeof(pte_t)) { dirty->method = VMW_BO_DIRTY_PAGETABLE; } else { @@ -274,10 +274,8 @@ void vmw_bo_dirty_release(struct vmw_bo *vbo) { struct vmw_bo_dirty *dirty = vbo->dirty; - if (dirty && --dirty->ref_count == 0) { - kvfree(dirty); + if (dirty && kref_put(&dirty->ref_count, (void *)kvfree)) vbo->dirty = NULL; - } } /** diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig index 7219f6b884b6..4b288eb3f5b0 100644 --- a/drivers/gpu/drm/xe/Kconfig +++ b/drivers/gpu/drm/xe/Kconfig @@ -13,7 +13,6 @@ config DRM_XE select TMPFS select DRM_BUDDY select DRM_CLIENT_SELECTION - select DRM_EXEC select DRM_KMS_HELPER select DRM_KUNIT_TEST_HELPERS if DRM_XE_KUNIT_TEST != n select DRM_PANEL diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h index 51f2a03847f9..f680c8b8f258 100644 --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h @@ -168,6 +168,7 @@ #define XEHP_SLICE_COMMON_ECO_CHICKEN1 XE_REG_MCR(0x731c, XE_REG_OPTION_MASKED) #define MSC_MSAA_REODER_BUF_BYPASS_DISABLE REG_BIT(14) +#define FAST_CLEAR_VALIGN_FIX REG_BIT(13) #define XE2LPM_CCCHKNREG1 XE_REG(0x82a8) diff --git a/drivers/gpu/drm/xe/tests/xe_mocs.c b/drivers/gpu/drm/xe/tests/xe_mocs.c index 0e502feaca81..6bb278167aaf 100644 --- a/drivers/gpu/drm/xe/tests/xe_mocs.c +++ b/drivers/gpu/drm/xe/tests/xe_mocs.c @@ -49,7 +49,7 @@ static void read_l3cc_table(struct xe_gt *gt, fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); if (!xe_force_wake_ref_has_domain(fw_ref, XE_FORCEWAKE_ALL)) { xe_force_wake_put(gt_to_fw(gt), fw_ref); - KUNIT_ASSERT_TRUE_MSG(test, true, "Forcewake Failed.\n"); + KUNIT_FAIL_AND_ABORT(test, "Forcewake Failed.\n"); } for (i = 0; i < info->num_mocs_regs; i++) { diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 34d33965eac2..456899238377 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -988,16 +988,16 @@ void xe_device_shutdown(struct xe_device *xe) drm_dbg(&xe->drm, "Shutting down device\n"); - if (xe_driver_flr_disabled(xe)) { - xe_display_pm_shutdown(xe); + xe_display_pm_shutdown(xe); - xe_irq_suspend(xe); + xe_irq_suspend(xe); - for_each_gt(gt, xe, id) - xe_gt_shutdown(gt); + for_each_gt(gt, xe, id) + xe_gt_shutdown(gt); - xe_display_pm_shutdown_late(xe); - } else { + xe_display_pm_shutdown_late(xe); + + if (!xe_driver_flr_disabled(xe)) { /* BOOM! */ __xe_driver_flr(xe); } diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 7715e74bb945..a8ab363a8046 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -165,7 +165,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) { err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs], - &syncs_user[num_syncs], SYNC_PARSE_FLAG_EXEC | + &syncs_user[num_syncs], NULL, 0, + SYNC_PARSE_FLAG_EXEC | (xe_vm_in_lr_mode(vm) ? SYNC_PARSE_FLAG_LR_MODE : 0)); if (err) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 37b2b93b73d6..cb5f204c08ed 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -10,6 +10,7 @@ #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> +#include <drm/drm_syncobj.h> #include <uapi/drm/xe_drm.h> #include "xe_dep_scheduler.h" @@ -324,6 +325,16 @@ struct xe_exec_queue *xe_exec_queue_create_bind(struct xe_device *xe, } xe_vm_put(migrate_vm); + if (!IS_ERR(q)) { + int err = drm_syncobj_create(&q->ufence_syncobj, + DRM_SYNCOBJ_CREATE_SIGNALED, + NULL); + if (err) { + xe_exec_queue_put(q); + return ERR_PTR(err); + } + } + return q; } ALLOW_ERROR_INJECTION(xe_exec_queue_create_bind, ERRNO); @@ -333,6 +344,9 @@ void xe_exec_queue_destroy(struct kref *ref) struct xe_exec_queue *q = container_of(ref, struct xe_exec_queue, refcount); struct xe_exec_queue *eq, *next; + if (q->ufence_syncobj) + drm_syncobj_put(q->ufence_syncobj); + if (xe_exec_queue_uses_pxp(q)) xe_pxp_exec_queue_remove(gt_to_xe(q->gt)->pxp, q); diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 27b76cf9da89..df1c69dc81f1 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -15,6 +15,7 @@ #include "xe_hw_fence_types.h" #include "xe_lrc_types.h" +struct drm_syncobj; struct xe_execlist_exec_queue; struct xe_gt; struct xe_guc_exec_queue; @@ -155,6 +156,12 @@ struct xe_exec_queue { struct list_head link; } pxp; + /** @ufence_syncobj: User fence syncobj */ + struct drm_syncobj *ufence_syncobj; + + /** @ufence_timeline_value: User fence timeline value */ + u64 ufence_timeline_value; + /** @ops: submission backend exec queue operations */ const struct xe_exec_queue_ops *ops; diff --git a/drivers/gpu/drm/xe/xe_gt_clock.c b/drivers/gpu/drm/xe/xe_gt_clock.c index 4f011d1573c6..f65d1edd0567 100644 --- a/drivers/gpu/drm/xe/xe_gt_clock.c +++ b/drivers/gpu/drm/xe/xe_gt_clock.c @@ -93,11 +93,6 @@ int xe_gt_clock_init(struct xe_gt *gt) return 0; } -static u64 div_u64_roundup(u64 n, u32 d) -{ - return div_u64(n + d - 1, d); -} - /** * xe_gt_clock_interval_to_ms - Convert sampled GT clock ticks to msec * @@ -108,5 +103,5 @@ static u64 div_u64_roundup(u64 n, u32 d) */ u64 xe_gt_clock_interval_to_ms(struct xe_gt *gt, u64 count) { - return div_u64_roundup(count * MSEC_PER_SEC, gt->info.reference_clock); + return mul_u64_u32_div(count, MSEC_PER_SEC, gt->info.reference_clock); } diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 18f6327bf552..b7afe8e983cb 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -200,6 +200,9 @@ static void guc_ct_fini(struct drm_device *drm, void *arg) { struct xe_guc_ct *ct = arg; +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG) + cancel_work_sync(&ct->dead.worker); +#endif ct_exit_safe_mode(ct); destroy_workqueue(ct->g2h_wq); xa_destroy(&ct->fence_lookup); @@ -223,6 +226,12 @@ int xe_guc_ct_init_noalloc(struct xe_guc_ct *ct) xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE)); + err = drmm_mutex_init(&xe->drm, &ct->lock); + if (err) + return err; + + primelockdep(ct); + ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", WQ_MEM_RECLAIM); if (!ct->g2h_wq) return -ENOMEM; @@ -234,16 +243,13 @@ int xe_guc_ct_init_noalloc(struct xe_guc_ct *ct) #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) spin_lock_init(&ct->dead.lock); INIT_WORK(&ct->dead.worker, ct_dead_worker_func); +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) + stack_depot_init(); +#endif #endif init_waitqueue_head(&ct->wq); init_waitqueue_head(&ct->g2h_fence_wq); - err = drmm_mutex_init(&xe->drm, &ct->lock); - if (err) - return err; - - primelockdep(ct); - err = drmm_add_action_or_reset(&xe->drm, guc_ct_fini, ct); if (err) return err; diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c index 870edaf69388..06976cc77918 100644 --- a/drivers/gpu/drm/xe/xe_irq.c +++ b/drivers/gpu/drm/xe/xe_irq.c @@ -847,22 +847,6 @@ static int xe_irq_msix_init(struct xe_device *xe) return 0; } -static irqreturn_t guc2host_irq_handler(int irq, void *arg) -{ - struct xe_device *xe = arg; - struct xe_tile *tile; - u8 id; - - if (!atomic_read(&xe->irq.enabled)) - return IRQ_NONE; - - for_each_tile(tile, xe, id) - xe_guc_irq_handler(&tile->primary_gt->uc.guc, - GUC_INTR_GUC2HOST); - - return IRQ_HANDLED; -} - static irqreturn_t xe_irq_msix_default_hwe_handler(int irq, void *arg) { unsigned int tile_id, gt_id; @@ -979,7 +963,7 @@ int xe_irq_msix_request_irqs(struct xe_device *xe) u16 msix; msix = GUC2HOST_MSIX; - err = xe_irq_msix_request_irq(xe, guc2host_irq_handler, xe, + err = xe_irq_msix_request_irq(xe, xe_irq_handler(xe), xe, DRIVER_NAME "-guc2host", false, &msix); if (err) return err; diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index a4894eb0d7f3..125698a9ecf1 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -10,6 +10,7 @@ #include <drm/drm_drv.h> #include <drm/drm_managed.h> +#include <drm/drm_syncobj.h> #include <uapi/drm/xe_drm.h> #include <generated/xe_wa_oob.h> @@ -1389,7 +1390,9 @@ static int xe_oa_user_extensions(struct xe_oa *oa, enum xe_oa_user_extn_from fro return 0; } -static int xe_oa_parse_syncs(struct xe_oa *oa, struct xe_oa_open_param *param) +static int xe_oa_parse_syncs(struct xe_oa *oa, + struct xe_oa_stream *stream, + struct xe_oa_open_param *param) { int ret, num_syncs, num_ufence = 0; @@ -1409,7 +1412,9 @@ static int xe_oa_parse_syncs(struct xe_oa *oa, struct xe_oa_open_param *param) for (num_syncs = 0; num_syncs < param->num_syncs; num_syncs++) { ret = xe_sync_entry_parse(oa->xe, param->xef, ¶m->syncs[num_syncs], - ¶m->syncs_user[num_syncs], 0); + ¶m->syncs_user[num_syncs], + stream->ufence_syncobj, + ++stream->ufence_timeline_value, 0); if (ret) goto err_syncs; @@ -1539,7 +1544,7 @@ static long xe_oa_config_locked(struct xe_oa_stream *stream, u64 arg) return -ENODEV; param.xef = stream->xef; - err = xe_oa_parse_syncs(stream->oa, ¶m); + err = xe_oa_parse_syncs(stream->oa, stream, ¶m); if (err) goto err_config_put; @@ -1635,6 +1640,7 @@ static void xe_oa_destroy_locked(struct xe_oa_stream *stream) if (stream->exec_q) xe_exec_queue_put(stream->exec_q); + drm_syncobj_put(stream->ufence_syncobj); kfree(stream); } @@ -1826,6 +1832,7 @@ static int xe_oa_stream_open_ioctl_locked(struct xe_oa *oa, struct xe_oa_open_param *param) { struct xe_oa_stream *stream; + struct drm_syncobj *ufence_syncobj; int stream_fd; int ret; @@ -1836,17 +1843,31 @@ static int xe_oa_stream_open_ioctl_locked(struct xe_oa *oa, goto exit; } + ret = drm_syncobj_create(&ufence_syncobj, DRM_SYNCOBJ_CREATE_SIGNALED, + NULL); + if (ret) + goto exit; + stream = kzalloc(sizeof(*stream), GFP_KERNEL); if (!stream) { ret = -ENOMEM; - goto exit; + goto err_syncobj; } - + stream->ufence_syncobj = ufence_syncobj; stream->oa = oa; - ret = xe_oa_stream_init(stream, param); + + ret = xe_oa_parse_syncs(oa, stream, param); if (ret) goto err_free; + ret = xe_oa_stream_init(stream, param); + if (ret) { + while (param->num_syncs--) + xe_sync_entry_cleanup(¶m->syncs[param->num_syncs]); + kfree(param->syncs); + goto err_free; + } + if (!param->disabled) { ret = xe_oa_enable_locked(stream); if (ret) @@ -1870,6 +1891,8 @@ err_destroy: xe_oa_stream_destroy(stream); err_free: kfree(stream); +err_syncobj: + drm_syncobj_put(ufence_syncobj); exit: return ret; } @@ -2083,22 +2106,14 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f goto err_exec_q; } - ret = xe_oa_parse_syncs(oa, ¶m); - if (ret) - goto err_exec_q; - mutex_lock(¶m.hwe->gt->oa.gt_lock); ret = xe_oa_stream_open_ioctl_locked(oa, ¶m); mutex_unlock(¶m.hwe->gt->oa.gt_lock); if (ret < 0) - goto err_sync_cleanup; + goto err_exec_q; return ret; -err_sync_cleanup: - while (param.num_syncs--) - xe_sync_entry_cleanup(¶m.syncs[param.num_syncs]); - kfree(param.syncs); err_exec_q: if (param.exec_q) xe_exec_queue_put(param.exec_q); diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h index 2628f78c4e8d..daf701b5d48b 100644 --- a/drivers/gpu/drm/xe/xe_oa_types.h +++ b/drivers/gpu/drm/xe/xe_oa_types.h @@ -15,6 +15,8 @@ #include "regs/xe_reg_defs.h" #include "xe_hw_engine_types.h" +struct drm_syncobj; + #define DEFAULT_XE_OA_BUFFER_SIZE SZ_16M enum xe_oa_report_header { @@ -248,6 +250,12 @@ struct xe_oa_stream { /** @xef: xe_file with which the stream was opened */ struct xe_file *xef; + /** @ufence_syncobj: User fence syncobj */ + struct drm_syncobj *ufence_syncobj; + + /** @ufence_timeline_value: User fence timeline value */ + u64 ufence_timeline_value; + /** @last_fence: fence to use in stream destroy when needed */ struct dma_fence *last_fence; diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 9a6df79fc5b6..89cc6d32f041 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -375,6 +375,7 @@ static const struct pci_device_id pciidlist[] = { INTEL_LNL_IDS(INTEL_VGA_DEVICE, &lnl_desc), INTEL_BMG_IDS(INTEL_VGA_DEVICE, &bmg_desc), INTEL_PTL_IDS(INTEL_VGA_DEVICE, &ptl_desc), + INTEL_WCL_IDS(INTEL_VGA_DEVICE, &ptl_desc), { } }; MODULE_DEVICE_TABLE(pci, pciidlist); diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c index 82872a51f098..d48ab7b32ca5 100644 --- a/drivers/gpu/drm/xe/xe_sync.c +++ b/drivers/gpu/drm/xe/xe_sync.c @@ -113,6 +113,8 @@ static void user_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb) int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, struct xe_sync_entry *sync, struct drm_xe_sync __user *sync_user, + struct drm_syncobj *ufence_syncobj, + u64 ufence_timeline_value, unsigned int flags) { struct drm_xe_sync sync_in; @@ -192,10 +194,15 @@ int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, if (exec) { sync->addr = sync_in.addr; } else { + sync->ufence_timeline_value = ufence_timeline_value; sync->ufence = user_fence_create(xe, sync_in.addr, sync_in.timeline_value); if (XE_IOCTL_DBG(xe, IS_ERR(sync->ufence))) return PTR_ERR(sync->ufence); + sync->ufence_chain_fence = dma_fence_chain_alloc(); + if (!sync->ufence_chain_fence) + return -ENOMEM; + sync->ufence_syncobj = ufence_syncobj; } break; @@ -239,7 +246,12 @@ void xe_sync_entry_signal(struct xe_sync_entry *sync, struct dma_fence *fence) } else if (sync->ufence) { int err; - dma_fence_get(fence); + drm_syncobj_add_point(sync->ufence_syncobj, + sync->ufence_chain_fence, + fence, sync->ufence_timeline_value); + sync->ufence_chain_fence = NULL; + + fence = drm_syncobj_fence_get(sync->ufence_syncobj); user_fence_get(sync->ufence); err = dma_fence_add_callback(fence, &sync->ufence->cb, user_fence_cb); @@ -259,7 +271,8 @@ void xe_sync_entry_cleanup(struct xe_sync_entry *sync) drm_syncobj_put(sync->syncobj); dma_fence_put(sync->fence); dma_fence_chain_free(sync->chain_fence); - if (sync->ufence) + dma_fence_chain_free(sync->ufence_chain_fence); + if (!IS_ERR_OR_NULL(sync->ufence)) user_fence_put(sync->ufence); } diff --git a/drivers/gpu/drm/xe/xe_sync.h b/drivers/gpu/drm/xe/xe_sync.h index 256ffc1e54dc..51f2d803e977 100644 --- a/drivers/gpu/drm/xe/xe_sync.h +++ b/drivers/gpu/drm/xe/xe_sync.h @@ -8,6 +8,7 @@ #include "xe_sync_types.h" +struct drm_syncobj; struct xe_device; struct xe_exec_queue; struct xe_file; @@ -21,6 +22,8 @@ struct xe_vm; int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, struct xe_sync_entry *sync, struct drm_xe_sync __user *sync_user, + struct drm_syncobj *ufence_syncobj, + u64 ufence_timeline_value, unsigned int flags); int xe_sync_entry_add_deps(struct xe_sync_entry *sync, struct xe_sched_job *job); diff --git a/drivers/gpu/drm/xe/xe_sync_types.h b/drivers/gpu/drm/xe/xe_sync_types.h index 30ac3f51993b..b88f1833e28c 100644 --- a/drivers/gpu/drm/xe/xe_sync_types.h +++ b/drivers/gpu/drm/xe/xe_sync_types.h @@ -18,9 +18,12 @@ struct xe_sync_entry { struct drm_syncobj *syncobj; struct dma_fence *fence; struct dma_fence_chain *chain_fence; + struct dma_fence_chain *ufence_chain_fence; + struct drm_syncobj *ufence_syncobj; struct xe_user_fence *ufence; u64 addr; u64 timeline_value; + u64 ufence_timeline_value; u32 type; u32 flags; }; diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 63c65e3d207b..cdd1dc540a59 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -3369,8 +3369,10 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm, op == DRM_XE_VM_BIND_OP_PREFETCH) || XE_IOCTL_DBG(xe, prefetch_region && op != DRM_XE_VM_BIND_OP_PREFETCH) || - XE_IOCTL_DBG(xe, (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC && - !(BIT(prefetch_region) & xe->info.mem_region_mask))) || + XE_IOCTL_DBG(xe, (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC && + /* Guard against undefined shift in BIT(prefetch_region) */ + (prefetch_region >= (sizeof(xe->info.mem_region_mask) * 8) || + !(BIT(prefetch_region) & xe->info.mem_region_mask)))) || XE_IOCTL_DBG(xe, obj && op == DRM_XE_VM_BIND_OP_UNMAP) || XE_IOCTL_DBG(xe, (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET) && @@ -3606,8 +3608,12 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file) syncs_user = u64_to_user_ptr(args->syncs); for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) { + struct xe_exec_queue *__q = q ?: vm->q[0]; + err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs], &syncs_user[num_syncs], + __q->ufence_syncobj, + ++__q->ufence_timeline_value, (xe_vm_in_lr_mode(vm) ? SYNC_PARSE_FLAG_LR_MODE : 0) | (!args->num_binds ? diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c index cd03891654a1..3cf30718b200 100644 --- a/drivers/gpu/drm/xe/xe_wa.c +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -679,6 +679,8 @@ static const struct xe_rtp_entry_sr engine_was[] = { }, { XE_RTP_NAME("14023061436"), XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3001), + FUNC(xe_rtp_match_first_render_or_compute), OR, + GRAPHICS_VERSION_RANGE(3003, 3005), FUNC(xe_rtp_match_first_render_or_compute)), XE_RTP_ACTIONS(SET(TDL_CHICKEN, QID_WAIT_FOR_THREAD_NOT_RUN_DISABLE)) }, @@ -916,6 +918,15 @@ static const struct xe_rtp_entry_sr lrc_was[] = { XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3003), ENGINE_CLASS(RENDER)), XE_RTP_ACTIONS(SET(COMMON_SLICE_CHICKEN4, SBE_PUSH_CONSTANT_BEHIND_FIX_ENABLE)) }, + { XE_RTP_NAME("14024681466"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3005), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(XEHP_SLICE_COMMON_ECO_CHICKEN1, FAST_CLEAR_VALIGN_FIX)) + }, + { XE_RTP_NAME("15016589081"), + XE_RTP_RULES(GRAPHICS_VERSION(3000), GRAPHICS_STEP(A0, B0), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(CHICKEN_RASTER_1, DIS_CLIP_NEGATIVE_BOUNDING_BOX)) + }, }; static __maybe_unused const struct xe_rtp_entry oob_was[] = { diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c index 0a9b44ce4904..b0bab2a1ddcc 100644 --- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c +++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c @@ -194,6 +194,8 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata) if (rc) goto cleanup; + mp2_ops->stop(privdata, cl_data->sensor_idx[i]); + amd_sfh_wait_for_response(privdata, cl_data->sensor_idx[i], DISABLE_SENSOR); writel(0, privdata->mmio + amd_get_p2c_val(privdata, 0)); mp2_ops->start(privdata, info); status = amd_sfh_wait_for_response diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 61404d7a43ee..57da4f86a9fa 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -355,6 +355,7 @@ static const struct apple_key_translation swapped_fn_leftctrl_keys[] = { static const struct apple_non_apple_keyboard non_apple_keyboards[] = { { "SONiX USB DEVICE" }, + { "SONiX AK870 PRO" }, { "Keychron" }, { "AONE" }, { "GANSS" }, diff --git a/drivers/hid/hid-corsair-void.c b/drivers/hid/hid-corsair-void.c index fee134a7eba3..5e9a5b8f7f16 100644 --- a/drivers/hid/hid-corsair-void.c +++ b/drivers/hid/hid-corsair-void.c @@ -553,9 +553,8 @@ static void corsair_void_add_battery(struct corsair_void_drvdata *drvdata) if (IS_ERR(new_supply)) { hid_err(drvdata->hid_dev, - "failed to register battery '%s' (reason: %ld)\n", - drvdata->battery_desc.name, - PTR_ERR(new_supply)); + "failed to register battery '%s' (reason: %pe)\n", + drvdata->battery_desc.name, new_supply); return; } diff --git a/drivers/hid/hid-elecom.c b/drivers/hid/hid-elecom.c index 69771fd35006..981d1b6e9658 100644 --- a/drivers/hid/hid-elecom.c +++ b/drivers/hid/hid-elecom.c @@ -75,7 +75,8 @@ static const __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, */ mouse_button_fixup(hdev, rdesc, *rsize, 20, 28, 22, 14, 8); break; - case USB_DEVICE_ID_ELECOM_M_XT3URBK: + case USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB: + case USB_DEVICE_ID_ELECOM_M_XT3URBK_018F: case USB_DEVICE_ID_ELECOM_M_XT3DRBK: case USB_DEVICE_ID_ELECOM_M_XT4DRBK: /* @@ -119,7 +120,8 @@ static const __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc, static const struct hid_device_id elecom_devices[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_BM084) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XGL20DLBK) }, - { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_018F) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT4DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) }, diff --git a/drivers/hid/hid-haptic.c b/drivers/hid/hid-haptic.c index aa090684c1f2..fc8a9997f815 100644 --- a/drivers/hid/hid-haptic.c +++ b/drivers/hid/hid-haptic.c @@ -86,7 +86,7 @@ int hid_haptic_input_configured(struct hid_device *hdev, if (hi->application == HID_DG_TOUCHPAD) { if (haptic->auto_trigger_report && haptic->manual_trigger_report) { - __set_bit(INPUT_PROP_HAPTIC_TOUCHPAD, hi->input->propbit); + __set_bit(INPUT_PROP_PRESSUREPAD, hi->input->propbit); return 1; } return 0; diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 0723b4b1c9ec..c4589075a5ed 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -449,7 +449,8 @@ #define USB_VENDOR_ID_ELECOM 0x056e #define USB_DEVICE_ID_ELECOM_BM084 0x0061 #define USB_DEVICE_ID_ELECOM_M_XGL20DLBK 0x00e6 -#define USB_DEVICE_ID_ELECOM_M_XT3URBK 0x00fb +#define USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB 0x00fb +#define USB_DEVICE_ID_ELECOM_M_XT3URBK_018F 0x018f #define USB_DEVICE_ID_ELECOM_M_XT3DRBK 0x00fc #define USB_DEVICE_ID_ELECOM_M_XT4DRBK 0x00fd #define USB_DEVICE_ID_ELECOM_M_DT1URBK 0x00fe @@ -718,6 +719,7 @@ #define USB_DEVICE_ID_ITE_LENOVO_YOGA2 0x8350 #define I2C_DEVICE_ID_ITE_LENOVO_LEGION_Y720 0x837a #define USB_DEVICE_ID_ITE_LENOVO_YOGA900 0x8396 +#define I2C_DEVICE_ID_ITE_LENOVO_YOGA_SLIM_7X_KEYBOARD 0x8987 #define USB_DEVICE_ID_ITE8595 0x8595 #define USB_DEVICE_ID_ITE_MEDION_E1239T 0xce50 @@ -1543,7 +1545,7 @@ #define USB_VENDOR_ID_SIGNOTEC 0x2133 #define USB_DEVICE_ID_SIGNOTEC_VIEWSONIC_PD1011 0x0018 -#define USB_VENDOR_ID_SMARTLINKTECHNOLOGY 0x4c4a -#define USB_DEVICE_ID_SMARTLINKTECHNOLOGY_4155 0x4155 +#define USB_VENDOR_ID_JIELI_SDK_DEFAULT 0x4c4a +#define USB_DEVICE_ID_JIELI_SDK_4155 0x4155 #endif diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index e56e7de53279..2bbb645c2ff4 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -399,10 +399,11 @@ static const struct hid_device_id hid_battery_quirks[] = { { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_CHROMEBOOK_TROGDOR_POMPOM), HID_BATTERY_QUIRK_AVOID_QUERY }, /* - * Elan I2C-HID touchscreens seem to all report a non present battery, - * set HID_BATTERY_QUIRK_IGNORE for all Elan I2C-HID devices. + * Elan HID touchscreens seem to all report a non present battery, + * set HID_BATTERY_QUIRK_IGNORE for all Elan I2C and USB HID devices. */ { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, HID_ANY_ID), HID_BATTERY_QUIRK_IGNORE }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, HID_ANY_ID), HID_BATTERY_QUIRK_IGNORE }, {} }; diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index 654879814f97..9cc3e029e9f6 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -148,6 +148,14 @@ static const __u8 lenovo_tpIIbtkbd_need_fixup_collection[] = { 0x81, 0x01, /* Input (Const,Array,Abs,No Wrap,Linear,Preferred State,No Null Position) */ }; +static const __u8 lenovo_yoga7x_kbd_need_fixup_collection[] = { + 0x15, 0x00, // Logical Minimum (0) + 0x25, 0x65, // Logical Maximum (101) + 0x05, 0x07, // Usage Page (Keyboard) + 0x19, 0x00, // Usage Minimum (0) + 0x29, 0xDD, // Usage Maximum (221) +}; + static const __u8 *lenovo_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { @@ -177,6 +185,13 @@ static const __u8 *lenovo_report_fixup(struct hid_device *hdev, __u8 *rdesc, rdesc[260] = 0x01; /* report count (2) = 0x01 */ } break; + case I2C_DEVICE_ID_ITE_LENOVO_YOGA_SLIM_7X_KEYBOARD: + if (*rsize == 176 && + memcmp(&rdesc[52], lenovo_yoga7x_kbd_need_fixup_collection, + sizeof(lenovo_yoga7x_kbd_need_fixup_collection)) == 0) { + rdesc[55] = rdesc[61]; // logical maximum = usage maximum + } + break; } return rdesc; } @@ -1538,6 +1553,8 @@ static const struct hid_device_id lenovo_devices[] = { USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X12_TAB) }, { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X12_TAB2) }, + { HID_DEVICE(BUS_I2C, HID_GROUP_GENERIC, + USB_VENDOR_ID_ITE, I2C_DEVICE_ID_ITE_LENOVO_YOGA_SLIM_7X_KEYBOARD) }, { } }; diff --git a/drivers/hid/hid-ntrig.c b/drivers/hid/hid-ntrig.c index 0f76e241e0af..a7f10c45f62b 100644 --- a/drivers/hid/hid-ntrig.c +++ b/drivers/hid/hid-ntrig.c @@ -142,13 +142,13 @@ static void ntrig_report_version(struct hid_device *hdev) int ret; char buf[20]; struct usb_device *usb_dev = hid_to_usb_dev(hdev); - unsigned char *data = kmalloc(8, GFP_KERNEL); + unsigned char *data __free(kfree) = kmalloc(8, GFP_KERNEL); if (!hid_is_usb(hdev)) return; if (!data) - goto err_free; + return; ret = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0), USB_REQ_CLEAR_FEATURE, @@ -163,9 +163,6 @@ static void ntrig_report_version(struct hid_device *hdev) hid_info(hdev, "Firmware version: %s (%02x%02x %02x%02x)\n", buf, data[2], data[3], data[4], data[5]); } - -err_free: - kfree(data); } static ssize_t show_phys_width(struct device *dev, diff --git a/drivers/hid/hid-playstation.c b/drivers/hid/hid-playstation.c index 63f6eb9030d1..128aa6abd10b 100644 --- a/drivers/hid/hid-playstation.c +++ b/drivers/hid/hid-playstation.c @@ -1942,6 +1942,7 @@ static int dualshock4_get_calibration_data(struct dualshock4 *ds4) "Failed to retrieve DualShock4 calibration info: %d\n", ret); ret = -EILSEQ; + kfree(buf); goto transfer_failed; } else { break; @@ -1959,6 +1960,7 @@ static int dualshock4_get_calibration_data(struct dualshock4 *ds4) if (ret) { hid_warn(hdev, "Failed to retrieve DualShock4 calibration info: %d\n", ret); + kfree(buf); goto transfer_failed; } } diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index bcd4bccf1a7c..c89a015686c0 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -410,7 +410,8 @@ static const struct hid_device_id hid_have_special_driver[] = { #if IS_ENABLED(CONFIG_HID_ELECOM) { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_BM084) }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XGL20DLBK) }, - { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB) }, + { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_018F) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT4DRBK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) }, @@ -915,7 +916,6 @@ static const struct hid_device_id hid_ignore_list[] = { #endif { HID_USB_DEVICE(USB_VENDOR_ID_YEALINK, USB_DEVICE_ID_YEALINK_P1K_P4K_B2K) }, { HID_USB_DEVICE(USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_QUANTA_HP_5MP_CAMERA_5473) }, - { HID_USB_DEVICE(USB_VENDOR_ID_SMARTLINKTECHNOLOGY, USB_DEVICE_ID_SMARTLINKTECHNOLOGY_4155) }, { } }; @@ -1064,6 +1064,18 @@ bool hid_ignore(struct hid_device *hdev) strlen(elan_acpi_id[i].id))) return true; break; + case USB_VENDOR_ID_JIELI_SDK_DEFAULT: + /* + * Multiple USB devices with identical IDs (mic & touchscreen). + * The touch screen requires hid core processing, but the + * microphone does not. They can be distinguished by manufacturer + * and serial number. + */ + if (hdev->product == USB_DEVICE_ID_JIELI_SDK_4155 && + strncmp(hdev->name, "SmartlinkTechnology", 19) == 0 && + strncmp(hdev->uniq, "20201111000001", 14) == 0) + return true; + break; } if (hdev->type == HID_TYPE_USBMOUSE && diff --git a/drivers/hid/hid-uclogic-params.c b/drivers/hid/hid-uclogic-params.c index ffa14a4621ef..4c4bac6f792b 100644 --- a/drivers/hid/hid-uclogic-params.c +++ b/drivers/hid/hid-uclogic-params.c @@ -1369,8 +1369,10 @@ static int uclogic_params_ugee_v2_init_event_hooks(struct hid_device *hdev, event_hook->hdev = hdev; event_hook->size = ARRAY_SIZE(reconnect_event); event_hook->event = kmemdup(reconnect_event, event_hook->size, GFP_KERNEL); - if (!event_hook->event) + if (!event_hook->event) { + kfree(event_hook); return -ENOMEM; + } list_add_tail(&event_hook->list, &p->event_hooks->list); diff --git a/drivers/hid/usbhid/hid-pidff.c b/drivers/hid/usbhid/hid-pidff.c index edd61ef50e16..95377c5f6335 100644 --- a/drivers/hid/usbhid/hid-pidff.c +++ b/drivers/hid/usbhid/hid-pidff.c @@ -806,8 +806,8 @@ static int pidff_request_effect_upload(struct pidff_device *pidff, int efnum) static int pidff_needs_playback(struct pidff_device *pidff, int effect_id, int n) { - return pidff->effect[effect_id].is_infinite || - pidff->effect[effect_id].loop_count != n; + return !pidff->effect[effect_id].is_infinite || + pidff->effect[effect_id].loop_count != n; } /* diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e3b2bd417c46..1d8d8d00e4e0 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -29,6 +29,7 @@ #include <linux/crash_dump.h> #include <linux/panic_notifier.h> #include <linux/vmalloc.h> +#include <linux/rseq.h> #include "mshv_eventfd.h" #include "mshv.h" @@ -560,6 +561,8 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) } } while (!vp->run.flags.intercept_suspend); + rseq_virt_userspace_exit(); + return ret; } @@ -1870,8 +1873,6 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) struct hv_partition_creation_properties creation_properties = {}; union hv_partition_isolation_properties isolation_properties = {}; struct mshv_partition *partition; - struct file *file; - int fd; long ret; if (copy_from_user(&args, user_arg, sizeof(args))) @@ -1938,29 +1939,13 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) goto delete_partition; ret = mshv_init_async_handler(partition); - if (ret) - goto remove_partition; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - ret = fd; - goto remove_partition; - } - - file = anon_inode_getfile("mshv_partition", &mshv_partition_fops, - partition, O_RDWR); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto put_fd; + if (!ret) { + ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition", + &mshv_partition_fops, + partition, O_RDWR)); + if (ret >= 0) + return ret; } - - fd_install(fd, file); - - return fd; - -put_fd: - put_unused_fd(fd); -remove_partition: remove_partition(partition); delete_partition: hv_call_delete_partition(partition->pt_id); diff --git a/drivers/hwmon/gpd-fan.c b/drivers/hwmon/gpd-fan.c index 321794807e8d..f81c3bc422f4 100644 --- a/drivers/hwmon/gpd-fan.c +++ b/drivers/hwmon/gpd-fan.c @@ -12,9 +12,9 @@ * Copyright (c) 2024 Cryolitia PukNgae */ -#include <linux/acpi.h> #include <linux/dmi.h> #include <linux/hwmon.h> +#include <linux/io.h> #include <linux/ioport.h> #include <linux/kernel.h> #include <linux/module.h> @@ -276,31 +276,6 @@ static int gpd_generic_read_rpm(void) return (u16)high << 8 | low; } -static void gpd_win4_init_ec(void) -{ - u8 chip_id, chip_ver; - - gpd_ecram_read(0x2000, &chip_id); - - if (chip_id == 0x55) { - gpd_ecram_read(0x1060, &chip_ver); - gpd_ecram_write(0x1060, chip_ver | 0x80); - } -} - -static int gpd_win4_read_rpm(void) -{ - int ret; - - ret = gpd_generic_read_rpm(); - - if (ret == 0) - // Re-init EC when speed is 0 - gpd_win4_init_ec(); - - return ret; -} - static int gpd_wm2_read_rpm(void) { for (u16 pwm_ctr_offset = GPD_PWM_CTR_OFFSET; @@ -320,11 +295,10 @@ static int gpd_wm2_read_rpm(void) static int gpd_read_rpm(void) { switch (gpd_driver_priv.drvdata->board) { + case win4_6800u: case win_mini: case duo: return gpd_generic_read_rpm(); - case win4_6800u: - return gpd_win4_read_rpm(); case win_max_2: return gpd_wm2_read_rpm(); } @@ -607,6 +581,28 @@ static struct hwmon_chip_info gpd_fan_chip_info = { .info = gpd_fan_hwmon_channel_info }; +static void gpd_win4_init_ec(void) +{ + u8 chip_id, chip_ver; + + gpd_ecram_read(0x2000, &chip_id); + + if (chip_id == 0x55) { + gpd_ecram_read(0x1060, &chip_ver); + gpd_ecram_write(0x1060, chip_ver | 0x80); + } +} + +static void gpd_init_ec(void) +{ + // The buggy firmware won't initialize EC properly on boot. + // Before its initialization, reading RPM will always return 0, + // and writing PWM will have no effect. + // Initialize it manually on driver load. + if (gpd_driver_priv.drvdata->board == win4_6800u) + gpd_win4_init_ec(); +} + static int gpd_fan_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -634,6 +630,8 @@ static int gpd_fan_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(hwdev), "Failed to register hwmon device\n"); + gpd_init_ec(); + return 0; } diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index 43643d2c5bdd..9f64f463339d 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -1474,9 +1474,10 @@ static void arm_trbe_remove_cpuhp(struct trbe_drvdata *drvdata) static int arm_trbe_probe_irq(struct platform_device *pdev, struct trbe_drvdata *drvdata) { + const struct cpumask *affinity; int ret; - drvdata->irq = platform_get_irq(pdev, 0); + drvdata->irq = platform_get_irq_affinity(pdev, 0, &affinity); if (drvdata->irq < 0) { pr_err("IRQ not found for the platform device\n"); return drvdata->irq; @@ -1487,14 +1488,14 @@ static int arm_trbe_probe_irq(struct platform_device *pdev, return -EINVAL; } - if (irq_get_percpu_devid_partition(drvdata->irq, &drvdata->supported_cpus)) - return -EINVAL; + cpumask_copy(&drvdata->supported_cpus, affinity); drvdata->handle = alloc_percpu(struct perf_output_handle *); if (!drvdata->handle) return -ENOMEM; - ret = request_percpu_irq(drvdata->irq, arm_trbe_irq_handler, DRVNAME, drvdata->handle); + ret = request_percpu_irq_affinity(drvdata->irq, arm_trbe_irq_handler, DRVNAME, + affinity, drvdata->handle); if (ret) { free_percpu(drvdata->handle); return ret; diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c index 75c8d08fa24e..b9f370c9f018 100644 --- a/drivers/i2c/muxes/i2c-mux-pca954x.c +++ b/drivers/i2c/muxes/i2c-mux-pca954x.c @@ -118,6 +118,7 @@ struct pca954x { raw_spinlock_t lock; struct regulator *supply; + struct gpio_desc *reset_gpio; struct reset_control *reset_cont; }; @@ -315,25 +316,6 @@ static u8 pca954x_regval(struct pca954x *data, u8 chan) return 1 << chan; } -static void pca954x_reset_assert(struct pca954x *data) -{ - if (data->reset_cont) - reset_control_assert(data->reset_cont); -} - -static void pca954x_reset_deassert(struct pca954x *data) -{ - if (data->reset_cont) - reset_control_deassert(data->reset_cont); -} - -static void pca954x_reset_mux(struct pca954x *data) -{ - pca954x_reset_assert(data); - udelay(1); - pca954x_reset_deassert(data); -} - static int pca954x_select_chan(struct i2c_mux_core *muxc, u32 chan) { struct pca954x *data = i2c_mux_priv(muxc); @@ -347,8 +329,6 @@ static int pca954x_select_chan(struct i2c_mux_core *muxc, u32 chan) ret = pca954x_reg_write(muxc->parent, client, regval); data->last_chan = ret < 0 ? 0 : regval; } - if (ret == -ETIMEDOUT && data->reset_cont) - pca954x_reset_mux(data); return ret; } @@ -358,7 +338,6 @@ static int pca954x_deselect_mux(struct i2c_mux_core *muxc, u32 chan) struct pca954x *data = i2c_mux_priv(muxc); struct i2c_client *client = data->client; s32 idle_state; - int ret = 0; idle_state = READ_ONCE(data->idle_state); if (idle_state >= 0) @@ -368,10 +347,8 @@ static int pca954x_deselect_mux(struct i2c_mux_core *muxc, u32 chan) if (idle_state == MUX_IDLE_DISCONNECT) { /* Deselect active channel */ data->last_chan = 0; - ret = pca954x_reg_write(muxc->parent, client, - data->last_chan); - if (ret == -ETIMEDOUT && data->reset_cont) - pca954x_reset_mux(data); + return pca954x_reg_write(muxc->parent, client, + data->last_chan); } /* otherwise leave as-is */ @@ -550,10 +527,29 @@ static int pca954x_get_reset(struct device *dev, struct pca954x *data) if (IS_ERR(data->reset_cont)) return dev_err_probe(dev, PTR_ERR(data->reset_cont), "Failed to get reset\n"); + else if (data->reset_cont) + return 0; + + /* + * fallback to legacy reset-gpios + */ + data->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH); + if (IS_ERR(data->reset_gpio)) { + return dev_err_probe(dev, PTR_ERR(data->reset_gpio), + "Failed to get reset gpio"); + } return 0; } +static void pca954x_reset_deassert(struct pca954x *data) +{ + if (data->reset_cont) + reset_control_deassert(data->reset_cont); + else + gpiod_set_value_cansleep(data->reset_gpio, 0); +} + /* * I2C init/probing/exit functions */ @@ -593,7 +589,7 @@ static int pca954x_probe(struct i2c_client *client) if (ret) goto fail_cleanup; - if (data->reset_cont) { + if (data->reset_cont || data->reset_gpio) { udelay(1); pca954x_reset_deassert(data); /* Give the chip some time to recover. */ diff --git a/drivers/iio/accel/adxl355_core.c b/drivers/iio/accel/adxl355_core.c index 2e00fd51b4d5..5fc7f814b907 100644 --- a/drivers/iio/accel/adxl355_core.c +++ b/drivers/iio/accel/adxl355_core.c @@ -56,6 +56,8 @@ #define ADXL355_POWER_CTL_DRDY_MSK BIT(2) #define ADXL355_SELF_TEST_REG 0x2E #define ADXL355_RESET_REG 0x2F +#define ADXL355_BASE_ADDR_SHADOW_REG 0x50 +#define ADXL355_SHADOW_REG_COUNT 5 #define ADXL355_DEVID_AD_VAL 0xAD #define ADXL355_DEVID_MST_VAL 0x1D @@ -294,7 +296,12 @@ static void adxl355_fill_3db_frequency_table(struct adxl355_data *data) static int adxl355_setup(struct adxl355_data *data) { unsigned int regval; + int retries = 5; /* the number is chosen based on empirical reasons */ int ret; + u8 *shadow_regs __free(kfree) = kzalloc(ADXL355_SHADOW_REG_COUNT, GFP_KERNEL); + + if (!shadow_regs) + return -ENOMEM; ret = regmap_read(data->regmap, ADXL355_DEVID_AD_REG, ®val); if (ret) @@ -321,14 +328,41 @@ static int adxl355_setup(struct adxl355_data *data) if (regval != ADXL355_PARTID_VAL) dev_warn(data->dev, "Invalid DEV ID 0x%02x\n", regval); - /* - * Perform a software reset to make sure the device is in a consistent - * state after start-up. - */ - ret = regmap_write(data->regmap, ADXL355_RESET_REG, ADXL355_RESET_CODE); + /* Read shadow registers to be compared after reset */ + ret = regmap_bulk_read(data->regmap, + ADXL355_BASE_ADDR_SHADOW_REG, + shadow_regs, ADXL355_SHADOW_REG_COUNT); if (ret) return ret; + do { + if (--retries == 0) { + dev_err(data->dev, "Shadow registers mismatch\n"); + return -EIO; + } + + /* + * Perform a software reset to make sure the device is in a consistent + * state after start-up. + */ + ret = regmap_write(data->regmap, ADXL355_RESET_REG, + ADXL355_RESET_CODE); + if (ret) + return ret; + + /* Wait at least 5ms after software reset */ + usleep_range(5000, 10000); + + /* Read shadow registers for comparison */ + ret = regmap_bulk_read(data->regmap, + ADXL355_BASE_ADDR_SHADOW_REG, + data->buffer.buf, + ADXL355_SHADOW_REG_COUNT); + if (ret) + return ret; + } while (memcmp(shadow_regs, data->buffer.buf, + ADXL355_SHADOW_REG_COUNT)); + ret = regmap_update_bits(data->regmap, ADXL355_POWER_CTL_REG, ADXL355_POWER_CTL_DRDY_MSK, FIELD_PREP(ADXL355_POWER_CTL_DRDY_MSK, 1)); diff --git a/drivers/iio/accel/bmc150-accel-core.c b/drivers/iio/accel/bmc150-accel-core.c index 3c5d1560b163..42ccf0316ce5 100644 --- a/drivers/iio/accel/bmc150-accel-core.c +++ b/drivers/iio/accel/bmc150-accel-core.c @@ -523,6 +523,10 @@ static int bmc150_accel_set_interrupt(struct bmc150_accel_data *data, int i, const struct bmc150_accel_interrupt_info *info = intr->info; int ret; + /* We do not always have an IRQ */ + if (data->irq <= 0) + return 0; + if (state) { if (atomic_inc_return(&intr->users) > 1) return 0; @@ -1696,6 +1700,7 @@ int bmc150_accel_core_probe(struct device *dev, struct regmap *regmap, int irq, } if (irq > 0) { + data->irq = irq; ret = devm_request_threaded_irq(dev, irq, bmc150_accel_irq_handler, bmc150_accel_irq_thread_handler, diff --git a/drivers/iio/accel/bmc150-accel.h b/drivers/iio/accel/bmc150-accel.h index 7a7baf52e595..e8f26198359f 100644 --- a/drivers/iio/accel/bmc150-accel.h +++ b/drivers/iio/accel/bmc150-accel.h @@ -58,6 +58,7 @@ enum bmc150_accel_trigger_id { struct bmc150_accel_data { struct regmap *regmap; + int irq; struct regulator_bulk_data regulators[2]; struct bmc150_accel_interrupt interrupts[BMC150_ACCEL_INTERRUPTS]; struct bmc150_accel_trigger triggers[BMC150_ACCEL_TRIGGERS]; diff --git a/drivers/iio/adc/ad4030.c b/drivers/iio/adc/ad4030.c index 1bc2f9a22470..d8bee6a4215a 100644 --- a/drivers/iio/adc/ad4030.c +++ b/drivers/iio/adc/ad4030.c @@ -385,7 +385,7 @@ static int ad4030_get_chan_scale(struct iio_dev *indio_dev, struct ad4030_state *st = iio_priv(indio_dev); const struct iio_scan_type *scan_type; - scan_type = iio_get_current_scan_type(indio_dev, st->chip->channels); + scan_type = iio_get_current_scan_type(indio_dev, chan); if (IS_ERR(scan_type)) return PTR_ERR(scan_type); diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c index 910b40393f77..61623cc6cb25 100644 --- a/drivers/iio/adc/ad7124.c +++ b/drivers/iio/adc/ad7124.c @@ -1525,10 +1525,6 @@ static int __ad7124_calibrate_all(struct ad7124_state *st, struct iio_dev *indio int ret, i; for (i = 0; i < st->num_channels; i++) { - - if (indio_dev->channels[i].type != IIO_VOLTAGE) - continue; - /* * For calibration the OFFSET register should hold its reset default * value. For the GAIN register there is no such requirement but @@ -1539,6 +1535,14 @@ static int __ad7124_calibrate_all(struct ad7124_state *st, struct iio_dev *indio st->channels[i].cfg.calibration_gain = st->gain_default; /* + * Only the main voltage input channels are important enough + * to be automatically calibrated here. For everything else, + * just use the default values set above. + */ + if (indio_dev->channels[i].type != IIO_VOLTAGE) + continue; + + /* * Full-scale calibration isn't supported at gain 1, so skip in * that case. Note that untypically full-scale calibration has * to happen before zero-scale calibration. This only applies to diff --git a/drivers/iio/adc/ad7280a.c b/drivers/iio/adc/ad7280a.c index dda2986ccda0..50a6ff7c8b1c 100644 --- a/drivers/iio/adc/ad7280a.c +++ b/drivers/iio/adc/ad7280a.c @@ -541,7 +541,7 @@ static ssize_t ad7280_store_balance_timer(struct iio_dev *indio_dev, int val, val2; int ret; - ret = iio_str_to_fixpoint(buf, 1000, &val, &val2); + ret = iio_str_to_fixpoint(buf, 100, &val, &val2); if (ret) return ret; diff --git a/drivers/iio/adc/ad7380.c b/drivers/iio/adc/ad7380.c index fa251dc1aae6..bfd908deefc0 100644 --- a/drivers/iio/adc/ad7380.c +++ b/drivers/iio/adc/ad7380.c @@ -1227,6 +1227,14 @@ static int ad7380_offload_buffer_postenable(struct iio_dev *indio_dev) if (ret) return ret; + /* + * When the sequencer is required to read all channels, we need to + * trigger twice per sample period in order to read one complete set + * of samples. + */ + if (st->seq) + config.periodic.frequency_hz *= 2; + ret = spi_offload_trigger_enable(st->offload, st->offload_trigger, &config); if (ret) spi_unoptimize_message(&st->offload_msg); diff --git a/drivers/iio/adc/rtq6056.c b/drivers/iio/adc/rtq6056.c index ad9738228b7f..2bf3a09ac6b0 100644 --- a/drivers/iio/adc/rtq6056.c +++ b/drivers/iio/adc/rtq6056.c @@ -300,7 +300,7 @@ static int rtq6056_adc_read_channel(struct rtq6056_priv *priv, return IIO_VAL_INT; case RTQ6056_REG_SHUNTVOLT: case RTQ6056_REG_CURRENT: - *val = sign_extend32(regval, 16); + *val = sign_extend32(regval, 15); return IIO_VAL_INT; default: return -EINVAL; diff --git a/drivers/iio/adc/stm32-dfsdm-adc.c b/drivers/iio/adc/stm32-dfsdm-adc.c index 74b1b4dc6e81..9664b9bd75d4 100644 --- a/drivers/iio/adc/stm32-dfsdm-adc.c +++ b/drivers/iio/adc/stm32-dfsdm-adc.c @@ -725,9 +725,8 @@ static int stm32_dfsdm_generic_channel_parse_of(struct stm32_dfsdm *dfsdm, } df_ch->src = val; - ret = fwnode_property_read_u32(node, "st,adc-alt-channel", &df_ch->alt_si); - if (ret != -EINVAL) - df_ch->alt_si = 0; + if (fwnode_property_present(node, "st,adc-alt-channel")) + df_ch->alt_si = 1; if (adc->dev_data->type == DFSDM_IIO) { backend = devm_iio_backend_fwnode_get(&indio_dev->dev, NULL, node); diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c index ee294a775e8a..7a7a9d37339b 100644 --- a/drivers/iio/buffer/industrialio-buffer-dma.c +++ b/drivers/iio/buffer/industrialio-buffer-dma.c @@ -786,6 +786,12 @@ out_end_signalling: } EXPORT_SYMBOL_NS_GPL(iio_dma_buffer_enqueue_dmabuf, "IIO_DMA_BUFFER"); +struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer) +{ + return iio_buffer_to_queue(buffer)->dev; +} +EXPORT_SYMBOL_NS_GPL(iio_dma_buffer_get_dma_dev, "IIO_DMA_BUFFER"); + void iio_dma_buffer_lock_queue(struct iio_buffer *buffer) { struct iio_dma_buffer_queue *queue = iio_buffer_to_queue(buffer); diff --git a/drivers/iio/buffer/industrialio-buffer-dmaengine.c b/drivers/iio/buffer/industrialio-buffer-dmaengine.c index e9d9a7d39fe1..27dd56334345 100644 --- a/drivers/iio/buffer/industrialio-buffer-dmaengine.c +++ b/drivers/iio/buffer/industrialio-buffer-dmaengine.c @@ -177,6 +177,8 @@ static const struct iio_buffer_access_funcs iio_dmaengine_buffer_ops = { .lock_queue = iio_dma_buffer_lock_queue, .unlock_queue = iio_dma_buffer_unlock_queue, + .get_dma_dev = iio_dma_buffer_get_dma_dev, + .modes = INDIO_BUFFER_HARDWARE, .flags = INDIO_BUFFER_FLAG_FIXED_WATERMARK, }; diff --git a/drivers/iio/common/ssp_sensors/ssp_dev.c b/drivers/iio/common/ssp_sensors/ssp_dev.c index 1e167dc673ca..da09c9f3ceb6 100644 --- a/drivers/iio/common/ssp_sensors/ssp_dev.c +++ b/drivers/iio/common/ssp_sensors/ssp_dev.c @@ -503,7 +503,7 @@ static int ssp_probe(struct spi_device *spi) ret = spi_setup(spi); if (ret < 0) { dev_err(&spi->dev, "Failed to setup spi\n"); - return ret; + goto err_setup_spi; } data->fw_dl_state = SSP_FW_DL_STATE_NONE; @@ -568,6 +568,8 @@ err_read_reg: err_setup_irq: mutex_destroy(&data->pending_lock); mutex_destroy(&data->comm_lock); +err_setup_spi: + mfd_remove_devices(&spi->dev); dev_err(&spi->dev, "Probe failed!\n"); diff --git a/drivers/iio/humidity/hdc3020.c b/drivers/iio/humidity/hdc3020.c index ffb25596d3a8..78b2c171c8da 100644 --- a/drivers/iio/humidity/hdc3020.c +++ b/drivers/iio/humidity/hdc3020.c @@ -72,6 +72,9 @@ #define HDC3020_MAX_TEMP_HYST_MICRO 164748607 #define HDC3020_MAX_HUM_MICRO 99220264 +/* Divide 65535 from the datasheet by 5 to avoid overflows */ +#define HDC3020_THRESH_FRACTION (65535 / 5) + struct hdc3020_data { struct i2c_client *client; struct gpio_desc *reset_gpio; @@ -301,9 +304,9 @@ static int hdc3020_read_raw(struct iio_dev *indio_dev, case IIO_CHAN_INFO_SCALE: *val2 = 65536; if (chan->type == IIO_TEMP) - *val = 175; + *val = 175 * MILLI; else - *val = 100; + *val = 100 * MILLI; return IIO_VAL_FRACTIONAL; case IIO_CHAN_INFO_OFFSET: @@ -376,15 +379,18 @@ static int hdc3020_thresh_get_temp(u16 thresh) int temp; /* - * Get the temperature threshold from 9 LSBs, shift them to get - * the truncated temperature threshold representation and - * calculate the threshold according to the formula in the - * datasheet. Result is degree celsius scaled by 65535. + * Get the temperature threshold from 9 LSBs, shift them to get the + * truncated temperature threshold representation and calculate the + * threshold according to the explicit formula in the datasheet: + * T(C) = -45 + (175 * temp) / 65535. + * Additionally scale by HDC3020_THRESH_FRACTION to avoid precision loss + * when calculating threshold and hysteresis values. Result is degree + * celsius scaled by HDC3020_THRESH_FRACTION. */ temp = FIELD_GET(HDC3020_THRESH_TEMP_MASK, thresh) << HDC3020_THRESH_TEMP_TRUNC_SHIFT; - return -2949075 + (175 * temp); + return -2949075 / 5 + (175 / 5 * temp); } static int hdc3020_thresh_get_hum(u16 thresh) @@ -394,13 +400,16 @@ static int hdc3020_thresh_get_hum(u16 thresh) /* * Get the humidity threshold from 7 MSBs, shift them to get the * truncated humidity threshold representation and calculate the - * threshold according to the formula in the datasheet. Result is - * percent scaled by 65535. + * threshold according to the explicit formula in the datasheet: + * RH(%) = 100 * hum / 65535. + * Additionally scale by HDC3020_THRESH_FRACTION to avoid precision loss + * when calculating threshold and hysteresis values. Result is percent + * scaled by HDC3020_THRESH_FRACTION. */ hum = FIELD_GET(HDC3020_THRESH_HUM_MASK, thresh) << HDC3020_THRESH_HUM_TRUNC_SHIFT; - return hum * 100; + return hum * 100 / 5; } static u16 hdc3020_thresh_set_temp(int s_temp, u16 curr_thresh) @@ -455,8 +464,8 @@ int hdc3020_thresh_clr(s64 s_thresh, s64 s_hyst, enum iio_event_direction dir) else s_clr = s_thresh + s_hyst; - /* Divide by 65535 to get units of micro */ - return div_s64(s_clr, 65535); + /* Divide by HDC3020_THRESH_FRACTION to get units of micro */ + return div_s64(s_clr, HDC3020_THRESH_FRACTION); } static int _hdc3020_write_thresh(struct hdc3020_data *data, u16 reg, u16 val) @@ -507,7 +516,7 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev, clr = ret; /* Scale value to include decimal part into calculations */ - s_val = (val < 0) ? (val * 1000000 - val2) : (val * 1000000 + val2); + s_val = (val < 0) ? (val * 1000 - val2) : (val * 1000 + val2); switch (chan->type) { case IIO_TEMP: switch (info) { @@ -523,7 +532,8 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev, /* Calculate old hysteresis */ s_thresh = (s64)hdc3020_thresh_get_temp(thresh) * 1000000; s_clr = (s64)hdc3020_thresh_get_temp(clr) * 1000000; - s_hyst = div_s64(abs(s_thresh - s_clr), 65535); + s_hyst = div_s64(abs(s_thresh - s_clr), + HDC3020_THRESH_FRACTION); /* Set new threshold */ thresh = reg_val; /* Set old hysteresis */ @@ -532,16 +542,17 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev, case IIO_EV_INFO_HYSTERESIS: /* * Function hdc3020_thresh_get_temp returns temperature - * in degree celsius scaled by 65535. Scale by 1000000 - * to be able to subtract scaled hysteresis value. + * in degree celsius scaled by HDC3020_THRESH_FRACTION. + * Scale by 1000000 to be able to subtract scaled + * hysteresis value. */ s_thresh = (s64)hdc3020_thresh_get_temp(thresh) * 1000000; /* * Units of s_val are in micro degree celsius, scale by - * 65535 to get same units as s_thresh. + * HDC3020_THRESH_FRACTION to get same units as s_thresh. */ s_val = min(abs(s_val), HDC3020_MAX_TEMP_HYST_MICRO); - s_hyst = (s64)s_val * 65535; + s_hyst = (s64)s_val * HDC3020_THRESH_FRACTION; s_clr = hdc3020_thresh_clr(s_thresh, s_hyst, dir); s_clr = max(s_clr, HDC3020_MIN_TEMP_MICRO); s_clr = min(s_clr, HDC3020_MAX_TEMP_MICRO); @@ -565,7 +576,8 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev, /* Calculate old hysteresis */ s_thresh = (s64)hdc3020_thresh_get_hum(thresh) * 1000000; s_clr = (s64)hdc3020_thresh_get_hum(clr) * 1000000; - s_hyst = div_s64(abs(s_thresh - s_clr), 65535); + s_hyst = div_s64(abs(s_thresh - s_clr), + HDC3020_THRESH_FRACTION); /* Set new threshold */ thresh = reg_val; /* Try to set old hysteresis */ @@ -574,15 +586,16 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev, case IIO_EV_INFO_HYSTERESIS: /* * Function hdc3020_thresh_get_hum returns relative - * humidity in percent scaled by 65535. Scale by 1000000 - * to be able to subtract scaled hysteresis value. + * humidity in percent scaled by HDC3020_THRESH_FRACTION. + * Scale by 1000000 to be able to subtract scaled + * hysteresis value. */ s_thresh = (s64)hdc3020_thresh_get_hum(thresh) * 1000000; /* - * Units of s_val are in micro percent, scale by 65535 - * to get same units as s_thresh. + * Units of s_val are in micro percent, scale by + * HDC3020_THRESH_FRACTION to get same units as s_thresh. */ - s_hyst = (s64)s_val * 65535; + s_hyst = (s64)s_val * HDC3020_THRESH_FRACTION; s_clr = hdc3020_thresh_clr(s_thresh, s_hyst, dir); s_clr = max(s_clr, 0); s_clr = min(s_clr, HDC3020_MAX_HUM_MICRO); @@ -630,7 +643,7 @@ static int hdc3020_read_thresh(struct iio_dev *indio_dev, thresh = hdc3020_thresh_get_temp(ret); switch (info) { case IIO_EV_INFO_VALUE: - *val = thresh; + *val = thresh * MILLI; break; case IIO_EV_INFO_HYSTERESIS: ret = hdc3020_read_be16(data, reg_clr); @@ -638,18 +651,18 @@ static int hdc3020_read_thresh(struct iio_dev *indio_dev, return ret; clr = hdc3020_thresh_get_temp(ret); - *val = abs(thresh - clr); + *val = abs(thresh - clr) * MILLI; break; default: return -EOPNOTSUPP; } - *val2 = 65535; + *val2 = HDC3020_THRESH_FRACTION; return IIO_VAL_FRACTIONAL; case IIO_HUMIDITYRELATIVE: thresh = hdc3020_thresh_get_hum(ret); switch (info) { case IIO_EV_INFO_VALUE: - *val = thresh; + *val = thresh * MILLI; break; case IIO_EV_INFO_HYSTERESIS: ret = hdc3020_read_be16(data, reg_clr); @@ -657,12 +670,12 @@ static int hdc3020_read_thresh(struct iio_dev *indio_dev, return ret; clr = hdc3020_thresh_get_hum(ret); - *val = abs(thresh - clr); + *val = abs(thresh - clr) * MILLI; break; default: return -EOPNOTSUPP; } - *val2 = 65535; + *val2 = HDC3020_THRESH_FRACTION; return IIO_VAL_FRACTIONAL; default: return -EOPNOTSUPP; diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h index c225b246c8a5..381b016fa524 100644 --- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h +++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h @@ -192,6 +192,22 @@ struct st_lsm6dsx_fifo_ops { * @fifo_en: Hw timer FIFO enable register info (addr + mask). * @decimator: Hw timer FIFO decimator register info (addr + mask). * @freq_fine: Difference in % of ODR with respect to the typical. + * @ts_sensitivity: Nominal timestamp sensitivity. + * @ts_trim_coeff: Coefficient for calculating the calibrated timestamp gain. + * This coefficient comes into play when linearizing the formula + * used to calculate the calibrated timestamp (please see the + * relevant formula in the AN for the specific IMU). + * For example, in the case of LSM6DSO we have: + * + * 1 / (1 + x) ~= 1 - x (Taylor’s Series) + * ttrim[s] = 1 / (40000 * (1 + 0.0015 * val)) (from AN5192) + * ttrim[ns] ~= 25000 - 37.5 * val + * ttrim[ns] ~= 25000 - (37500 * val) / 1000 + * + * so, replacing ts_sensitivity = 25000 and + * ts_trim_coeff = 37500 + * + * ttrim[ns] ~= ts_sensitivity - (ts_trim_coeff * val) / 1000 */ struct st_lsm6dsx_hw_ts_settings { struct st_lsm6dsx_reg timer_en; @@ -199,6 +215,8 @@ struct st_lsm6dsx_hw_ts_settings { struct st_lsm6dsx_reg fifo_en; struct st_lsm6dsx_reg decimator; u8 freq_fine; + u16 ts_sensitivity; + u16 ts_trim_coeff; }; /** @@ -252,6 +270,15 @@ struct st_lsm6dsx_event_settings { u8 wakeup_src_x_mask; }; +enum st_lsm6dsx_sensor_id { + ST_LSM6DSX_ID_GYRO, + ST_LSM6DSX_ID_ACC, + ST_LSM6DSX_ID_EXT0, + ST_LSM6DSX_ID_EXT1, + ST_LSM6DSX_ID_EXT2, + ST_LSM6DSX_ID_MAX +}; + enum st_lsm6dsx_ext_sensor_id { ST_LSM6DSX_ID_MAGN, }; @@ -337,23 +364,14 @@ struct st_lsm6dsx_settings { struct st_lsm6dsx_odr_table_entry odr_table[2]; struct st_lsm6dsx_samples_to_discard samples_to_discard[2]; struct st_lsm6dsx_fs_table_entry fs_table[2]; - struct st_lsm6dsx_reg decimator[ST_LSM6DSX_MAX_ID]; - struct st_lsm6dsx_reg batch[ST_LSM6DSX_MAX_ID]; + struct st_lsm6dsx_reg decimator[ST_LSM6DSX_ID_MAX]; + struct st_lsm6dsx_reg batch[2]; struct st_lsm6dsx_fifo_ops fifo_ops; struct st_lsm6dsx_hw_ts_settings ts_settings; struct st_lsm6dsx_shub_settings shub_settings; struct st_lsm6dsx_event_settings event_settings; }; -enum st_lsm6dsx_sensor_id { - ST_LSM6DSX_ID_GYRO, - ST_LSM6DSX_ID_ACC, - ST_LSM6DSX_ID_EXT0, - ST_LSM6DSX_ID_EXT1, - ST_LSM6DSX_ID_EXT2, - ST_LSM6DSX_ID_MAX, -}; - enum st_lsm6dsx_fifo_mode { ST_LSM6DSX_FIFO_BYPASS = 0x0, ST_LSM6DSX_FIFO_CONT = 0x6, diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c index d8cb4b0218d5..a2daf0c14d96 100644 --- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c +++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c @@ -94,8 +94,6 @@ #define ST_LSM6DSX_REG_WHOAMI_ADDR 0x0f -#define ST_LSM6DSX_TS_SENSITIVITY 25000UL /* 25us */ - static const struct iio_chan_spec st_lsm6dsx_acc_channels[] = { ST_LSM6DSX_CHANNEL_ACC(IIO_ACCEL, 0x28, IIO_MOD_X, 0), ST_LSM6DSX_CHANNEL_ACC(IIO_ACCEL, 0x2a, IIO_MOD_Y, 1), @@ -983,6 +981,8 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .mask = GENMASK(7, 6), }, .freq_fine = 0x63, + .ts_sensitivity = 25000, + .ts_trim_coeff = 37500, }, .shub_settings = { .page_mux = { @@ -1196,6 +1196,8 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .mask = GENMASK(7, 6), }, .freq_fine = 0x63, + .ts_sensitivity = 25000, + .ts_trim_coeff = 37500, }, .event_settings = { .enable_reg = { @@ -1371,6 +1373,8 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = { .mask = GENMASK(7, 6), }, .freq_fine = 0x4f, + .ts_sensitivity = 21701, + .ts_trim_coeff = 28212, }, .shub_settings = { .page_mux = { @@ -2248,20 +2252,13 @@ static int st_lsm6dsx_init_hw_timer(struct st_lsm6dsx_hw *hw) } /* calibrate timestamp sensitivity */ - hw->ts_gain = ST_LSM6DSX_TS_SENSITIVITY; + hw->ts_gain = ts_settings->ts_sensitivity; if (ts_settings->freq_fine) { err = regmap_read(hw->regmap, ts_settings->freq_fine, &val); if (err < 0) return err; - /* - * linearize the AN5192 formula: - * 1 / (1 + x) ~= 1 - x (Taylor’s Series) - * ttrim[s] = 1 / (40000 * (1 + 0.0015 * val)) - * ttrim[ns] ~= 25000 - 37.5 * val - * ttrim[ns] ~= 25000 - (37500 * val) / 1000 - */ - hw->ts_gain -= ((s8)val * 37500) / 1000; + hw->ts_gain -= ((s8)val * ts_settings->ts_trim_coeff) / 1000; } return 0; diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index a80f7cc25a27..96ea0f039dfb 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -1623,19 +1623,28 @@ static int iio_dma_resv_lock(struct dma_buf *dmabuf, bool nonblock) return 0; } +static struct device *iio_buffer_get_dma_dev(const struct iio_dev *indio_dev, + struct iio_buffer *buffer) +{ + if (buffer->access->get_dma_dev) + return buffer->access->get_dma_dev(buffer); + + return indio_dev->dev.parent; +} + static struct dma_buf_attachment * iio_buffer_find_attachment(struct iio_dev_buffer_pair *ib, struct dma_buf *dmabuf, bool nonblock) { - struct device *dev = ib->indio_dev->dev.parent; struct iio_buffer *buffer = ib->buffer; + struct device *dma_dev = iio_buffer_get_dma_dev(ib->indio_dev, buffer); struct dma_buf_attachment *attach = NULL; struct iio_dmabuf_priv *priv; guard(mutex)(&buffer->dmabufs_mutex); list_for_each_entry(priv, &buffer->dmabufs, entry) { - if (priv->attach->dev == dev + if (priv->attach->dev == dma_dev && priv->attach->dmabuf == dmabuf) { attach = priv->attach; break; @@ -1653,6 +1662,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib, { struct iio_dev *indio_dev = ib->indio_dev; struct iio_buffer *buffer = ib->buffer; + struct device *dma_dev = iio_buffer_get_dma_dev(indio_dev, buffer); struct dma_buf_attachment *attach; struct iio_dmabuf_priv *priv, *each; struct dma_buf *dmabuf; @@ -1679,7 +1689,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib, goto err_free_priv; } - attach = dma_buf_attach(dmabuf, indio_dev->dev.parent); + attach = dma_buf_attach(dmabuf, dma_dev); if (IS_ERR(attach)) { err = PTR_ERR(attach); goto err_dmabuf_put; @@ -1719,7 +1729,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib, * combo. If we do, refuse to attach. */ list_for_each_entry(each, &buffer->dmabufs, entry) { - if (each->attach->dev == indio_dev->dev.parent + if (each->attach->dev == dma_dev && each->attach->dmabuf == dmabuf) { /* * We unlocked the reservation object, so going through @@ -1758,6 +1768,7 @@ static int iio_buffer_detach_dmabuf(struct iio_dev_buffer_pair *ib, { struct iio_buffer *buffer = ib->buffer; struct iio_dev *indio_dev = ib->indio_dev; + struct device *dma_dev = iio_buffer_get_dma_dev(indio_dev, buffer); struct iio_dmabuf_priv *priv; struct dma_buf *dmabuf; int dmabuf_fd, ret = -EPERM; @@ -1772,7 +1783,7 @@ static int iio_buffer_detach_dmabuf(struct iio_dev_buffer_pair *ib, guard(mutex)(&buffer->dmabufs_mutex); list_for_each_entry(priv, &buffer->dmabufs, entry) { - if (priv->attach->dev == indio_dev->dev.parent + if (priv->attach->dev == dma_dev && priv->attach->dmabuf == dmabuf) { list_del(&priv->entry); diff --git a/drivers/iio/pressure/bmp280-core.c b/drivers/iio/pressure/bmp280-core.c index c04e8bb4c993..d983ce9c0b99 100644 --- a/drivers/iio/pressure/bmp280-core.c +++ b/drivers/iio/pressure/bmp280-core.c @@ -1040,13 +1040,16 @@ static int bmp280_wait_conv(struct bmp280_data *data) unsigned int reg, meas_time_us; int ret; - /* Check if we are using a BME280 device */ - if (data->oversampling_humid) - meas_time_us = BMP280_PRESS_HUMID_MEAS_OFFSET + - BIT(data->oversampling_humid) * BMP280_MEAS_DUR; + /* Constant part of the measurement time */ + meas_time_us = BMP280_MEAS_OFFSET; - else - meas_time_us = 0; + /* + * Check if we are using a BME280 device, + * Humidity measurement time + */ + if (data->chip_info->oversampling_humid_avail) + meas_time_us += BMP280_PRESS_HUMID_MEAS_OFFSET + + BIT(data->oversampling_humid) * BMP280_MEAS_DUR; /* Pressure measurement time */ meas_time_us += BMP280_PRESS_HUMID_MEAS_OFFSET + diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index 37cd37556510..fab5d914029d 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -206,6 +206,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( return ret; err_free: + ib_umem_release(umem); rdma_restrack_put(&cq->res); kfree(cq); err_event_file: diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 4dab5ca7362b..84ce3fce2826 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -913,7 +913,7 @@ void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp, spin_unlock_irqrestore(&qp->scq->cq_lock, flags); } -static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp) +static void bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp) { struct bnxt_re_qp *gsi_sqp; struct bnxt_re_ah *gsi_sah; @@ -933,10 +933,9 @@ static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp) ibdev_dbg(&rdev->ibdev, "Destroy the shadow QP\n"); rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &gsi_sqp->qplib_qp); - if (rc) { + if (rc) ibdev_err(&rdev->ibdev, "Destroy Shadow QP failed"); - goto fail; - } + bnxt_qplib_free_qp_res(&rdev->qplib_res, &gsi_sqp->qplib_qp); /* remove from active qp list */ @@ -951,10 +950,6 @@ static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp) rdev->gsi_ctx.gsi_sqp = NULL; rdev->gsi_ctx.gsi_sah = NULL; rdev->gsi_ctx.sqp_tbl = NULL; - - return 0; -fail: - return rc; } static void bnxt_re_del_unique_gid(struct bnxt_re_dev *rdev) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index d9a12681f843..22d3e25c3b9d 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1216,13 +1216,13 @@ int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, if (umem->length < cq->size) { ibdev_dbg(&dev->ibdev, "External memory too small\n"); err = -EINVAL; - goto err_free_mem; + goto err_out; } if (!ib_umem_is_contiguous(umem)) { ibdev_dbg(&dev->ibdev, "Non contiguous CQ unsupported\n"); err = -EINVAL; - goto err_free_mem; + goto err_out; } cq->cpu_addr = NULL; @@ -1251,7 +1251,7 @@ int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, err = efa_com_create_cq(&dev->edev, ¶ms, &result); if (err) - goto err_free_mem; + goto err_free_mapped; resp.db_off = result.db_off; resp.cq_idx = result.cq_idx; @@ -1299,12 +1299,10 @@ err_remove_mmap: efa_cq_user_mmap_entries_remove(cq); err_destroy_cq: efa_destroy_cq_idx(dev, cq->cq_idx); -err_free_mem: - if (umem) - ib_umem_release(umem); - else - efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE); - +err_free_mapped: + if (!umem) + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, + DMA_FROM_DEVICE); err_out: atomic64_inc(&dev->stats.create_cq_err); return err; diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 3a5c93c9fb3e..6aa82fe9dd3d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -30,6 +30,7 @@ * SOFTWARE. */ +#include <linux/pci.h> #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> #include "hns_roce_device.h" @@ -37,6 +38,43 @@ #include "hns_roce_hem.h" #include "hns_roce_common.h" +void hns_roce_put_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device); + struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; + + if (hr_dev->pci_dev->revision < PCI_REVISION_ID_HIP09) + return; + + mutex_lock(&cq_table->bank_mutex); + cq_table->ctx_num[uctx->cq_bank_id]--; + mutex_unlock(&cq_table->bank_mutex); +} + +void hns_roce_get_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device); + struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; + u32 least_load = cq_table->ctx_num[0]; + u8 bankid = 0; + u8 i; + + if (hr_dev->pci_dev->revision < PCI_REVISION_ID_HIP09) + return; + + mutex_lock(&cq_table->bank_mutex); + for (i = 1; i < HNS_ROCE_CQ_BANK_NUM; i++) { + if (cq_table->ctx_num[i] < least_load) { + least_load = cq_table->ctx_num[i]; + bankid = i; + } + } + cq_table->ctx_num[bankid]++; + mutex_unlock(&cq_table->bank_mutex); + + uctx->cq_bank_id = bankid; +} + static u8 get_least_load_bankid_for_cq(struct hns_roce_bank *bank) { u32 least_load = bank[0].inuse; @@ -55,7 +93,21 @@ static u8 get_least_load_bankid_for_cq(struct hns_roce_bank *bank) return bankid; } -static int alloc_cqn(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) +static u8 select_cq_bankid(struct hns_roce_dev *hr_dev, + struct hns_roce_bank *bank, struct ib_udata *udata) +{ + struct hns_roce_ucontext *uctx = udata ? + rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, + ibucontext) : NULL; + + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) + return uctx ? uctx->cq_bank_id : 0; + + return get_least_load_bankid_for_cq(bank); +} + +static int alloc_cqn(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, + struct ib_udata *udata) { struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; struct hns_roce_bank *bank; @@ -63,7 +115,7 @@ static int alloc_cqn(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) int id; mutex_lock(&cq_table->bank_mutex); - bankid = get_least_load_bankid_for_cq(cq_table->bank); + bankid = select_cq_bankid(hr_dev, cq_table->bank, udata); bank = &cq_table->bank[bankid]; id = ida_alloc_range(&bank->ida, bank->min, bank->max, GFP_KERNEL); @@ -396,7 +448,7 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, goto err_cq_buf; } - ret = alloc_cqn(hr_dev, hr_cq); + ret = alloc_cqn(hr_dev, hr_cq, udata); if (ret) { ibdev_err(ibdev, "failed to alloc CQN, ret = %d.\n", ret); goto err_cq_db; diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 78ee04a48a74..06832c0ac055 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -217,6 +217,7 @@ struct hns_roce_ucontext { struct mutex page_mutex; struct hns_user_mmap_entry *db_mmap_entry; u32 config; + u8 cq_bank_id; }; struct hns_roce_pd { @@ -495,6 +496,7 @@ struct hns_roce_cq_table { struct hns_roce_hem_table table; struct hns_roce_bank bank[HNS_ROCE_CQ_BANK_NUM]; struct mutex bank_mutex; + u32 ctx_num[HNS_ROCE_CQ_BANK_NUM]; }; struct hns_roce_srq_table { @@ -1305,5 +1307,7 @@ hns_roce_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address, size_t length, enum hns_roce_mmap_type mmap_type); bool check_sl_valid(struct hns_roce_dev *hr_dev, u8 sl); +void hns_roce_put_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx); +void hns_roce_get_cq_bankid_for_uctx(struct hns_roce_ucontext *uctx); #endif /* _HNS_ROCE_DEVICE_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index f82bdd46a917..63052c0e7613 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -165,6 +165,8 @@ static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, hr_reg_write(fseg, FRMR_PBL_BUF_PG_SZ, to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift)); hr_reg_clear(fseg, FRMR_BLK_MODE); + hr_reg_clear(fseg, FRMR_BLOCK_SIZE); + hr_reg_clear(fseg, FRMR_ZBVA); } static void set_atomic_seg(const struct ib_send_wr *wr, @@ -339,9 +341,6 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, int j = 0; int i; - hr_reg_write(rc_sq_wqe, RC_SEND_WQE_MSG_START_SGE_IDX, - (*sge_ind) & (qp->sge.sge_cnt - 1)); - hr_reg_write(rc_sq_wqe, RC_SEND_WQE_INLINE, !!(wr->send_flags & IB_SEND_INLINE)); if (wr->send_flags & IB_SEND_INLINE) @@ -586,6 +585,9 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, hr_reg_write(rc_sq_wqe, RC_SEND_WQE_CQE, (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0); + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_MSG_START_SGE_IDX, + curr_idx & (qp->sge.sge_cnt - 1)); + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { if (msg_len != ATOMIC_WR_LEN) @@ -734,6 +736,9 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, owner_bit = ~(((qp->sq.head + nreq) >> ilog2(qp->sq.wqe_cnt)) & 0x1); + /* RC and UD share the same DirectWQE field layout */ + ((struct hns_roce_v2_rc_send_wqe *)wqe)->byte_4 = 0; + /* Corresponding to the QP type, wqe process separately */ if (ibqp->qp_type == IB_QPT_RC) ret = set_rc_wqe(qp, wr, wqe, &sge_idx, owner_bit); @@ -7048,7 +7053,6 @@ static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) goto error_failed_roce_init; } - handle->priv = hr_dev; return 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index d50f36f8a110..f3607fe107a7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -425,6 +425,8 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, if (ret) goto error_fail_copy_to_udata; + hns_roce_get_cq_bankid_for_uctx(context); + return 0; error_fail_copy_to_udata: @@ -447,6 +449,8 @@ static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); struct hns_roce_dev *hr_dev = to_hr_dev(ibcontext->device); + hns_roce_put_cq_bankid_for_uctx(context); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_CQ_RECORD_DB || hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) mutex_destroy(&context->page_mutex); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 6ff1b8ce580c..bdd879ac12dd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -662,7 +662,6 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sq.wqe_shift = ucmd->log_sq_stride; hr_qp->sq.wqe_cnt = cnt; - cap->max_send_sge = hr_qp->sq.max_gs; return 0; } @@ -744,7 +743,6 @@ static int set_kernel_sq_size(struct hns_roce_dev *hr_dev, /* sync the parameters of kernel QP to user's configuration */ cap->max_send_wr = cnt; - cap->max_send_sge = hr_qp->sq.max_gs; return 0; } diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index 3091f9345f12..fa6325adaede 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -71,7 +71,7 @@ int irdma_hmc_init_pble(struct irdma_sc_dev *dev, static void get_sd_pd_idx(struct irdma_hmc_pble_rsrc *pble_rsrc, struct sd_pd_idx *idx) { - idx->sd_idx = (u32)pble_rsrc->next_fpm_addr / IRDMA_HMC_DIRECT_BP_SIZE; + idx->sd_idx = pble_rsrc->next_fpm_addr / IRDMA_HMC_DIRECT_BP_SIZE; idx->pd_idx = (u32)(pble_rsrc->next_fpm_addr / IRDMA_HMC_PAGED_BP_SIZE); idx->rel_pd_idx = (idx->pd_idx % IRDMA_HMC_PD_CNT_IN_SD); } diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index 4ae77cdde9dc..c1b8f81ea283 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -706,7 +706,7 @@ struct irdma_sc_dev { u32 vchnl_ver; u16 num_vfs; u16 hmc_fn_id; - u8 vf_id; + u16 vf_id; bool privileged:1; bool vchnl_up:1; bool ceq_valid:1; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 76ce6137f2ba..c883c9ea5a83 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2503,6 +2503,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, spin_lock_init(&iwcq->lock); INIT_LIST_HEAD(&iwcq->resize_list); INIT_LIST_HEAD(&iwcq->cmpl_generated); + iwcq->cq_num = cq_num; info.dev = dev; ukinfo->cq_size = max(entries, 4); ukinfo->cq_id = cq_num; diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index ed21c1b56e8e..ac8b38701835 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -140,7 +140,7 @@ struct irdma_srq { struct irdma_cq { struct ib_cq ibcq; struct irdma_sc_cq sc_cq; - u16 cq_num; + u32 cq_num; bool user_mode; atomic_t armed; enum irdma_cmpl_notify last_notify; diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index a23b364e24ff..651d76bca114 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1020,15 +1020,18 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN) MLX5_SET(cqc, cqc, oi, 1); + if (udata) { + cq->mcq.comp = mlx5_add_cq_to_tasklet; + cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; + } else { + cq->mcq.comp = mlx5_ib_cq_comp; + } + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out)); if (err) goto err_cqb; mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); - if (udata) - cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; - else - cq->mcq.comp = mlx5_ib_cq_comp; cq->mcq.event = mlx5_ib_cq_event; INIT_LIST_HEAD(&cq->wc_list); diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h index 1d7fc3226bca..cfb42a8f5768 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h @@ -53,6 +53,10 @@ extern void usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node, struct rb_root_cached *root); extern struct usnic_uiom_interval_node * +usnic_uiom_interval_tree_subtree_search(struct usnic_uiom_interval_node *node, + unsigned long start, + unsigned long last); +extern struct usnic_uiom_interval_node * usnic_uiom_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c index f7209c8ebbcc..1c6b0461dc35 100644 --- a/drivers/input/keyboard/cros_ec_keyb.c +++ b/drivers/input/keyboard/cros_ec_keyb.c @@ -261,6 +261,12 @@ static int cros_ec_keyb_work(struct notifier_block *nb, case EC_MKBP_EVENT_KEY_MATRIX: pm_wakeup_event(ckdev->dev, 0); + if (!ckdev->idev) { + dev_warn_once(ckdev->dev, + "Unexpected key matrix event\n"); + return NOTIFY_OK; + } + if (ckdev->ec->event_size != ckdev->cols) { dev_err(ckdev->dev, "Discarded incomplete key matrix event.\n"); diff --git a/drivers/input/keyboard/imx_sc_key.c b/drivers/input/keyboard/imx_sc_key.c index d18839f1f4f6..b620cd310cdb 100644 --- a/drivers/input/keyboard/imx_sc_key.c +++ b/drivers/input/keyboard/imx_sc_key.c @@ -158,7 +158,7 @@ static int imx_sc_key_probe(struct platform_device *pdev) return error; } - error = devm_add_action_or_reset(&pdev->dev, imx_sc_key_action, &priv); + error = devm_add_action_or_reset(&pdev->dev, imx_sc_key_action, priv); if (error) return error; diff --git a/drivers/input/tablet/pegasus_notetaker.c b/drivers/input/tablet/pegasus_notetaker.c index 8d6b71d59793..eabb4a0b8a0d 100644 --- a/drivers/input/tablet/pegasus_notetaker.c +++ b/drivers/input/tablet/pegasus_notetaker.c @@ -63,6 +63,9 @@ #define BUTTON_PRESSED 0xb5 #define COMMAND_VERSION 0xa9 +/* 1 Status + 1 Color + 2 X + 2 Y = 6 bytes */ +#define NOTETAKER_PACKET_SIZE 6 + /* in xy data packet */ #define BATTERY_NO_REPORT 0x40 #define BATTERY_LOW 0x41 @@ -311,6 +314,12 @@ static int pegasus_probe(struct usb_interface *intf, } pegasus->data_len = usb_maxpacket(dev, pipe); + if (pegasus->data_len < NOTETAKER_PACKET_SIZE) { + dev_err(&intf->dev, "packet size is too small (%d)\n", + pegasus->data_len); + error = -EINVAL; + goto err_free_mem; + } pegasus->data = usb_alloc_coherent(dev, pegasus->data_len, GFP_KERNEL, &pegasus->data_dma); diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c index 252dcae039f8..f8798d11ec03 100644 --- a/drivers/input/touchscreen/goodix.c +++ b/drivers/input/touchscreen/goodix.c @@ -796,17 +796,6 @@ int goodix_reset_no_int_sync(struct goodix_ts_data *ts) usleep_range(6000, 10000); /* T4: > 5ms */ - /* - * Put the reset pin back in to input / high-impedance mode to save - * power. Only do this in the non ACPI case since some ACPI boards - * don't have a pull-up, so there the reset pin must stay active-high. - */ - if (ts->irq_pin_access_method == IRQ_PIN_ACCESS_GPIO) { - error = gpiod_direction_input(ts->gpiod_rst); - if (error) - goto error; - } - return 0; error: @@ -957,14 +946,6 @@ static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts) return -EINVAL; } - /* - * Normally we put the reset pin in input / high-impedance mode to save - * power. But some x86/ACPI boards don't have a pull-up, so for the ACPI - * case, leave the pin as is. This results in the pin not being touched - * at all on x86/ACPI boards, except when needed for error-recover. - */ - ts->gpiod_rst_flags = GPIOD_ASIS; - return devm_acpi_dev_add_driver_gpios(dev, gpio_mapping); } #else @@ -989,12 +970,6 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts) return -EINVAL; dev = &ts->client->dev; - /* - * By default we request the reset pin as input, leaving it in - * high-impedance when not resetting the controller to save power. - */ - ts->gpiod_rst_flags = GPIOD_IN; - ts->avdd28 = devm_regulator_get(dev, "AVDD28"); if (IS_ERR(ts->avdd28)) return dev_err_probe(dev, PTR_ERR(ts->avdd28), "Failed to get AVDD28 regulator\n"); @@ -1019,7 +994,7 @@ retry_get_irq_gpio: ts->gpiod_int = gpiod; /* Get the reset line GPIO pin number */ - gpiod = devm_gpiod_get_optional(dev, GOODIX_GPIO_RST_NAME, ts->gpiod_rst_flags); + gpiod = devm_gpiod_get_optional(dev, GOODIX_GPIO_RST_NAME, GPIOD_ASIS); if (IS_ERR(gpiod)) return dev_err_probe(dev, PTR_ERR(gpiod), "Failed to get %s GPIO\n", GOODIX_GPIO_RST_NAME); @@ -1557,6 +1532,7 @@ MODULE_DEVICE_TABLE(i2c, goodix_ts_id); static const struct acpi_device_id goodix_acpi_match[] = { { "GDIX1001", 0 }, { "GDIX1002", 0 }, + { "GDIX1003", 0 }, { "GDX9110", 0 }, { } }; diff --git a/drivers/input/touchscreen/goodix.h b/drivers/input/touchscreen/goodix.h index 87797cc88b32..0d1e8a8d2cba 100644 --- a/drivers/input/touchscreen/goodix.h +++ b/drivers/input/touchscreen/goodix.h @@ -88,7 +88,6 @@ struct goodix_ts_data { struct gpio_desc *gpiod_rst; int gpio_count; int gpio_int_idx; - enum gpiod_flags gpiod_rst_flags; char id[GOODIX_ID_MAX_LEN + 1]; char cfg_name[64]; u16 version; diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 7944a3af4545..f1fb27681b0b 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -2008,7 +2008,7 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev, end - addr, iovad->granule - iova_start_pad); if (!dev_is_dma_coherent(dev) && - !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_cpu(phys, len, dir); swiotlb_tbl_unmap_single(dev, phys, len, dir, attrs); @@ -2032,7 +2032,8 @@ static void __iommu_dma_iova_unlink(struct device *dev, size_t unmapped; if ((state->__size & DMA_IOVA_USE_SWIOTLB) || - (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))) + (!dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))) iommu_dma_iova_unlink_range_slow(dev, addr, size, dir, attrs); iommu_iotlb_gather_init(&iotlb_gather); diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index 6f1010da221c..21d4a35538f6 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -161,8 +161,8 @@ int iommufd_viommu_report_event(struct iommufd_viommu *viommu, vevent = &veventq->lost_events_header; goto out_set_header; } - memcpy(vevent->event_data, event_data, data_len); vevent->data_len = data_len; + memcpy(vevent->event_data, event_data, data_len); veventq->num_events++; out_set_header: diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index c0360c450880..75d60f2ad900 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -707,7 +707,8 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, struct iopt_area *area; unsigned long unmapped_bytes = 0; unsigned int tries = 0; - int rc = -ENOENT; + /* If there are no mapped entries then success */ + int rc = 0; /* * The domains_rwsem must be held in read mode any time any area->pages @@ -777,8 +778,6 @@ again: down_write(&iopt->iova_rwsem); } - if (unmapped_bytes) - rc = 0; out_unlock_iova: up_write(&iopt->iova_rwsem); @@ -815,13 +814,8 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped) { - int rc; - - rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); /* If the IOVAs are empty then unmap all succeeds */ - if (rc == -ENOENT) - return 0; - return rc; + return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); } /* The caller must always free all the nodes in the allowed_iova rb_root. */ diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 1542c5fd10a8..459a7c516915 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -367,6 +367,10 @@ int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) &unmapped); if (rc) goto out_put; + if (!unmapped) { + rc = -ENOENT; + goto out_put; + } } cmd->length = unmapped; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 627f9b78483a..85d0843ed07b 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -614,7 +614,6 @@ struct iommufd_veventq { struct iommufd_eventq common; struct iommufd_viommu *viommu; struct list_head node; /* for iommufd_viommu::veventqs */ - struct iommufd_vevent lost_events_header; enum iommu_veventq_type type; unsigned int depth; @@ -622,6 +621,9 @@ struct iommufd_veventq { /* Use common.lock for protection */ u32 num_events; u32 sequence; + + /* Must be last as it ends in a flexible-array member. */ + struct iommufd_vevent lost_events_header; }; static inline struct iommufd_veventq * diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index 4514575818fc..b5b67a9d3fb3 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -130,9 +130,8 @@ struct iova_bitmap { static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap, unsigned long iova) { - unsigned long pgsize = 1UL << bitmap->mapped.pgshift; - - return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize); + return (iova >> bitmap->mapped.pgshift) / + BITS_PER_TYPE(*bitmap->bitmap); } /* diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index a61c6dc63c29..f334f49c9791 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -36,7 +36,6 @@ config GIC_NON_BANKED config ARM_GIC_V3 bool select IRQ_DOMAIN_HIERARCHY - select PARTITION_PERCPU select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP select HAVE_ARM_SMCCC_DISCOVERY select IRQ_MSI_IOMMU @@ -151,7 +150,7 @@ config BCM6345_L1_IRQ config BCM7038_L1_IRQ tristate "Broadcom STB 7038-style L1/L2 interrupt controller driver" - depends on ARCH_BRCMSTB || BMIPS_GENERIC + depends on ARCH_BRCMSTB || BMIPS_GENERIC || COMPILE_TEST default ARCH_BRCMSTB || BMIPS_GENERIC select GENERIC_IRQ_CHIP select IRQ_DOMAIN @@ -159,14 +158,14 @@ config BCM7038_L1_IRQ config BCM7120_L2_IRQ tristate "Broadcom STB 7120-style L2 interrupt controller driver" - depends on ARCH_BRCMSTB || BMIPS_GENERIC + depends on ARCH_BRCMSTB || BMIPS_GENERIC || COMPILE_TEST default ARCH_BRCMSTB || BMIPS_GENERIC select GENERIC_IRQ_CHIP select IRQ_DOMAIN config BRCMSTB_L2_IRQ tristate "Broadcom STB generic L2 interrupt controller driver" - depends on ARCH_BCM2835 || ARCH_BRCMSTB || BMIPS_GENERIC + depends on ARCH_BCM2835 || ARCH_BRCMSTB || BMIPS_GENERIC || COMPILE_TEST default ARCH_BCM2835 || ARCH_BRCMSTB || BMIPS_GENERIC select GENERIC_IRQ_CHIP select IRQ_DOMAIN @@ -451,9 +450,6 @@ config LS_SCFG_MSI depends on PCI_MSI select IRQ_MSI_LIB -config PARTITION_PERCPU - bool - config STM32MP_EXTI tristate "STM32MP extended interrupts and event controller" depends on (ARCH_STM32 && !ARM_SINGLE_ARMV7M) || COMPILE_TEST diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 3de083f5484c..6a229443efe0 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -36,7 +36,6 @@ obj-$(CONFIG_ARM_GIC_V3) += irq-gic-v3.o irq-gic-v3-mbi.o irq-gic-common.o obj-$(CONFIG_ARM_GIC_ITS_PARENT) += irq-gic-its-msi-parent.o obj-$(CONFIG_ARM_GIC_V3_ITS) += irq-gic-v3-its.o irq-gic-v4.o obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC) += irq-gic-v3-its-fsl-mc-msi.o -obj-$(CONFIG_PARTITION_PERCPU) += irq-partition-percpu.o obj-$(CONFIG_ARM_GIC_V5) += irq-gic-v5.o irq-gic-v5-irs.o irq-gic-v5-its.o \ irq-gic-v5-iwb.o obj-$(CONFIG_HISILICON_IRQ_MBIGEN) += irq-mbigen.o diff --git a/drivers/irqchip/irq-aclint-sswi.c b/drivers/irqchip/irq-aclint-sswi.c index 93e28e9f281f..fee30f3bc5ac 100644 --- a/drivers/irqchip/irq-aclint-sswi.c +++ b/drivers/irqchip/irq-aclint-sswi.c @@ -175,7 +175,8 @@ static int __init generic_aclint_sswi_early_probe(struct device_node *node, { return generic_aclint_sswi_probe(&node->fwnode); } -IRQCHIP_DECLARE(generic_aclint_sswi, "mips,p8700-aclint-sswi", generic_aclint_sswi_early_probe); +IRQCHIP_DECLARE(mips_p8700_sswi, "mips,p8700-aclint-sswi", generic_aclint_sswi_early_probe); +IRQCHIP_DECLARE(nuclei_ux900_sswi, "nuclei,ux900-aclint-sswi", generic_aclint_sswi_early_probe); /* THEAD variant */ #define THEAD_C9XX_CSR_SXSTATUS 0x5c0 diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index 032d66dceb8e..795b3db4554a 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -578,16 +578,9 @@ static void __exception_irq_entry aic_handle_fiq(struct pt_regs *regs) } if ((read_sysreg_s(SYS_IMP_APL_PMCR0_EL1) & (PMCR0_IMODE | PMCR0_IACT)) == - (FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_FIQ) | PMCR0_IACT)) { - int irq; - if (cpumask_test_cpu(smp_processor_id(), - &aic_irqc->fiq_aff[AIC_CPU_PMU_P]->aff)) - irq = AIC_CPU_PMU_P; - else - irq = AIC_CPU_PMU_E; + (FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_FIQ) | PMCR0_IACT)) generic_handle_domain_irq(aic_irqc->hw_domain, - AIC_FIQ_HWIRQ(irq)); - } + AIC_FIQ_HWIRQ(AIC_CPU_PMU_P)); if (static_branch_likely(&use_fast_ipi) && (FIELD_GET(UPMCR0_IMODE, read_sysreg_s(SYS_IMP_APL_UPMCR0_EL1)) == UPMCR0_IMODE_FIQ) && @@ -632,18 +625,7 @@ static int aic_irq_domain_map(struct irq_domain *id, unsigned int irq, handle_fasteoi_irq, NULL, NULL); irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(irq))); } else { - int fiq = FIELD_GET(AIC_EVENT_NUM, hw); - - switch (fiq) { - case AIC_CPU_PMU_P: - case AIC_CPU_PMU_E: - irq_set_percpu_devid_partition(irq, &ic->fiq_aff[fiq]->aff); - break; - default: - irq_set_percpu_devid(irq); - break; - } - + irq_set_percpu_devid(irq); irq_domain_set_info(id, irq, hw, &fiq_chip, id->host_data, handle_percpu_devid_irq, NULL, NULL); } @@ -651,6 +633,33 @@ static int aic_irq_domain_map(struct irq_domain *id, unsigned int irq, return 0; } +static int aic_irq_get_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) +{ + const struct cpumask *mask; + u32 intid; + + info->flags = 0; + info->affinity = NULL; + + if (fwspec->param[0] != AIC_FIQ) + return 0; + + if (fwspec->param_count == 3) + intid = fwspec->param[1]; + else + intid = fwspec->param[2]; + + if (aic_irqc->fiq_aff[intid]) + mask = &aic_irqc->fiq_aff[intid]->aff; + else + mask = cpu_possible_mask; + + info->affinity = mask; + info->flags = IRQ_FWSPEC_INFO_AFFINITY_VALID; + + return 0; +} + static int aic_irq_domain_translate(struct irq_domain *id, struct irq_fwspec *fwspec, unsigned long *hwirq, @@ -705,6 +714,10 @@ static int aic_irq_domain_translate(struct irq_domain *id, break; } } + + /* Merge the two PMUs on a single interrupt */ + if (*hwirq == AIC_CPU_PMU_E) + *hwirq = AIC_CPU_PMU_P; break; default: return -EINVAL; @@ -750,9 +763,10 @@ static void aic_irq_domain_free(struct irq_domain *domain, unsigned int virq, } static const struct irq_domain_ops aic_irq_domain_ops = { - .translate = aic_irq_domain_translate, - .alloc = aic_irq_domain_alloc, - .free = aic_irq_domain_free, + .translate = aic_irq_domain_translate, + .alloc = aic_irq_domain_alloc, + .free = aic_irq_domain_free, + .get_fwspec_info = aic_irq_get_fwspec_info, }; /* diff --git a/drivers/irqchip/irq-bcm2712-mip.c b/drivers/irqchip/irq-bcm2712-mip.c index 9bd7bc0bf6d5..4761974ad650 100644 --- a/drivers/irqchip/irq-bcm2712-mip.c +++ b/drivers/irqchip/irq-bcm2712-mip.c @@ -232,17 +232,12 @@ err_put: return ret; } -static int __init mip_of_msi_init(struct device_node *node, struct device_node *parent) +static int mip_msi_probe(struct platform_device *pdev, struct device_node *parent) { - struct platform_device *pdev; + struct device_node *node = pdev->dev.of_node; struct mip_priv *mip; int ret; - pdev = of_find_device_by_node(node); - of_node_put(node); - if (!pdev) - return -EPROBE_DEFER; - mip = kzalloc(sizeof(*mip), GFP_KERNEL); if (!mip) return -ENOMEM; @@ -285,7 +280,7 @@ err_priv: } IRQCHIP_PLATFORM_DRIVER_BEGIN(mip_msi) -IRQCHIP_MATCH("brcm,bcm2712-mip", mip_of_msi_init) +IRQCHIP_MATCH("brcm,bcm2712-mip", mip_msi_probe) IRQCHIP_PLATFORM_DRIVER_END(mip_msi) MODULE_DESCRIPTION("Broadcom BCM2712 MSI-X interrupt controller"); MODULE_AUTHOR("Phil Elwell <phil@raspberrypi.com>"); diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c index 04fac0cc857f..ea1446c0a09c 100644 --- a/drivers/irqchip/irq-bcm7038-l1.c +++ b/drivers/irqchip/irq-bcm7038-l1.c @@ -82,12 +82,6 @@ static inline unsigned int reg_status(struct bcm7038_l1_chip *intc, return (0 * intc->n_words + word) * sizeof(u32); } -static inline unsigned int reg_mask_status(struct bcm7038_l1_chip *intc, - unsigned int word) -{ - return (1 * intc->n_words + word) * sizeof(u32); -} - static inline unsigned int reg_mask_set(struct bcm7038_l1_chip *intc, unsigned int word) { @@ -219,9 +213,8 @@ static int bcm7038_l1_set_affinity(struct irq_data *d, } #endif -static int __init bcm7038_l1_init_one(struct device_node *dn, - unsigned int idx, - struct bcm7038_l1_chip *intc) +static int bcm7038_l1_init_one(struct device_node *dn, unsigned int idx, + struct bcm7038_l1_chip *intc) { struct resource res; resource_size_t sz; @@ -395,9 +388,9 @@ static const struct irq_domain_ops bcm7038_l1_domain_ops = { .map = bcm7038_l1_map, }; -static int __init bcm7038_l1_of_init(struct device_node *dn, - struct device_node *parent) +static int bcm7038_l1_probe(struct platform_device *pdev, struct device_node *parent) { + struct device_node *dn = pdev->dev.of_node; struct bcm7038_l1_chip *intc; int idx, ret; @@ -455,7 +448,7 @@ out_free: } IRQCHIP_PLATFORM_DRIVER_BEGIN(bcm7038_l1) -IRQCHIP_MATCH("brcm,bcm7038-l1-intc", bcm7038_l1_of_init) +IRQCHIP_MATCH("brcm,bcm7038-l1-intc", bcm7038_l1_probe) IRQCHIP_PLATFORM_DRIVER_END(bcm7038_l1) MODULE_DESCRIPTION("Broadcom STB 7038-style L1/L2 interrupt controller"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/irqchip/irq-bcm7120-l2.c b/drivers/irqchip/irq-bcm7120-l2.c index ff22c3104401..518c9d4366a5 100644 --- a/drivers/irqchip/irq-bcm7120-l2.c +++ b/drivers/irqchip/irq-bcm7120-l2.c @@ -143,8 +143,7 @@ static int bcm7120_l2_intc_init_one(struct device_node *dn, return 0; } -static int __init bcm7120_l2_intc_iomap_7120(struct device_node *dn, - struct bcm7120_l2_intc_data *data) +static int bcm7120_l2_intc_iomap_7120(struct device_node *dn, struct bcm7120_l2_intc_data *data) { int ret; @@ -177,8 +176,7 @@ static int __init bcm7120_l2_intc_iomap_7120(struct device_node *dn, return 0; } -static int __init bcm7120_l2_intc_iomap_3380(struct device_node *dn, - struct bcm7120_l2_intc_data *data) +static int bcm7120_l2_intc_iomap_3380(struct device_node *dn, struct bcm7120_l2_intc_data *data) { unsigned int gc_idx; @@ -208,15 +206,14 @@ static int __init bcm7120_l2_intc_iomap_3380(struct device_node *dn, return 0; } -static int __init bcm7120_l2_intc_probe(struct device_node *dn, - struct device_node *parent, +static int bcm7120_l2_intc_probe(struct platform_device *pdev, struct device_node *parent, int (*iomap_regs_fn)(struct device_node *, - struct bcm7120_l2_intc_data *), + struct bcm7120_l2_intc_data *), const char *intc_name) { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; + struct device_node *dn = pdev->dev.of_node; struct bcm7120_l2_intc_data *data; - struct platform_device *pdev; struct irq_chip_generic *gc; struct irq_chip_type *ct; int ret = 0; @@ -227,14 +224,7 @@ static int __init bcm7120_l2_intc_probe(struct device_node *dn, if (!data) return -ENOMEM; - pdev = of_find_device_by_node(dn); - if (!pdev) { - ret = -ENODEV; - goto out_free_data; - } - data->num_parent_irqs = platform_irq_count(pdev); - put_device(&pdev->dev); if (data->num_parent_irqs <= 0) { pr_err("invalid number of parent interrupts\n"); ret = -ENOMEM; @@ -334,22 +324,19 @@ out_unmap: if (data->map_base[idx]) iounmap(data->map_base[idx]); } -out_free_data: kfree(data); return ret; } -static int __init bcm7120_l2_intc_probe_7120(struct device_node *dn, - struct device_node *parent) +static int bcm7120_l2_intc_probe_7120(struct platform_device *pdev, struct device_node *parent) { - return bcm7120_l2_intc_probe(dn, parent, bcm7120_l2_intc_iomap_7120, + return bcm7120_l2_intc_probe(pdev, parent, bcm7120_l2_intc_iomap_7120, "BCM7120 L2"); } -static int __init bcm7120_l2_intc_probe_3380(struct device_node *dn, - struct device_node *parent) +static int bcm7120_l2_intc_probe_3380(struct platform_device *pdev, struct device_node *parent) { - return bcm7120_l2_intc_probe(dn, parent, bcm7120_l2_intc_iomap_3380, + return bcm7120_l2_intc_probe(pdev, parent, bcm7120_l2_intc_iomap_3380, "BCM3380 L2"); } diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index 1bec5b2cd3f0..bb7078d6524f 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -138,13 +138,12 @@ static void brcmstb_l2_intc_resume(struct irq_data *d) irq_reg_writel(gc, ~b->saved_mask, ct->regs.enable); } -static int __init brcmstb_l2_intc_of_init(struct device_node *np, - struct device_node *parent, - const struct brcmstb_intc_init_params - *init_params) +static int brcmstb_l2_intc_probe(struct platform_device *pdev, struct device_node *parent, + const struct brcmstb_intc_init_params *init_params) { unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN; unsigned int set = 0; + struct device_node *np = pdev->dev.of_node; struct brcmstb_l2_intc_data *data; struct irq_chip_type *ct; int ret; @@ -257,23 +256,21 @@ out_free: return ret; } -static int __init brcmstb_l2_edge_intc_of_init(struct device_node *np, - struct device_node *parent) +static int brcmstb_l2_edge_intc_probe(struct platform_device *pdev, struct device_node *parent) { - return brcmstb_l2_intc_of_init(np, parent, &l2_edge_intc_init); + return brcmstb_l2_intc_probe(pdev, parent, &l2_edge_intc_init); } -static int __init brcmstb_l2_lvl_intc_of_init(struct device_node *np, - struct device_node *parent) +static int brcmstb_l2_lvl_intc_probe(struct platform_device *pdev, struct device_node *parent) { - return brcmstb_l2_intc_of_init(np, parent, &l2_lvl_intc_init); + return brcmstb_l2_intc_probe(pdev, parent, &l2_lvl_intc_init); } IRQCHIP_PLATFORM_DRIVER_BEGIN(brcmstb_l2) -IRQCHIP_MATCH("brcm,l2-intc", brcmstb_l2_edge_intc_of_init) -IRQCHIP_MATCH("brcm,hif-spi-l2-intc", brcmstb_l2_edge_intc_of_init) -IRQCHIP_MATCH("brcm,upg-aux-aon-l2-intc", brcmstb_l2_edge_intc_of_init) -IRQCHIP_MATCH("brcm,bcm7271-l2-intc", brcmstb_l2_lvl_intc_of_init) +IRQCHIP_MATCH("brcm,l2-intc", brcmstb_l2_edge_intc_probe) +IRQCHIP_MATCH("brcm,hif-spi-l2-intc", brcmstb_l2_edge_intc_probe) +IRQCHIP_MATCH("brcm,upg-aux-aon-l2-intc", brcmstb_l2_edge_intc_probe) +IRQCHIP_MATCH("brcm,bcm7271-l2-intc", brcmstb_l2_lvl_intc_probe) IRQCHIP_PLATFORM_DRIVER_END(brcmstb_l2) MODULE_DESCRIPTION("Broadcom STB generic L2 interrupt controller"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/irqchip/irq-gic-its-msi-parent.c b/drivers/irqchip/irq-gic-its-msi-parent.c index eb1473f1448a..12f45228c867 100644 --- a/drivers/irqchip/irq-gic-its-msi-parent.c +++ b/drivers/irqchip/irq-gic-its-msi-parent.c @@ -142,83 +142,38 @@ static int its_v5_pci_msi_prepare(struct irq_domain *domain, struct device *dev, #define its_v5_pci_msi_prepare NULL #endif /* !CONFIG_PCI_MSI */ -static int of_pmsi_get_dev_id(struct irq_domain *domain, struct device *dev, - u32 *dev_id) +static int of_pmsi_get_msi_info(struct irq_domain *domain, struct device *dev, u32 *dev_id, + phys_addr_t *pa) { - int ret, index = 0; + struct of_phandle_iterator it; + int ret; /* Suck the DeviceID out of the msi-parent property */ - do { - struct of_phandle_args args; - - ret = of_parse_phandle_with_args(dev->of_node, - "msi-parent", "#msi-cells", - index, &args); - if (args.np == irq_domain_get_of_node(domain)) { - if (WARN_ON(args.args_count != 1)) - return -EINVAL; - *dev_id = args.args[0]; - break; - } - index++; - } while (!ret); - - if (ret) { - struct device_node *np = NULL; + of_for_each_phandle(&it, ret, dev->of_node, "msi-parent", "#msi-cells", -1) { + /* GICv5 ITS domain matches the MSI controller node parent */ + struct device_node *np __free(device_node) = pa ? of_get_parent(it.node) + : of_node_get(it.node); - ret = of_map_id(dev->of_node, dev->id, "msi-map", "msi-map-mask", &np, dev_id); - if (np) - of_node_put(np); - } + if (np == irq_domain_get_of_node(domain)) { + u32 args; - return ret; -} + if (WARN_ON(of_phandle_iterator_args(&it, &args, 1) != 1)) + ret = -EINVAL; -static int of_v5_pmsi_get_msi_info(struct irq_domain *domain, struct device *dev, - u32 *dev_id, phys_addr_t *pa) -{ - int ret, index = 0; - /* - * Retrieve the DeviceID and the ITS translate frame node pointer - * out of the msi-parent property. - */ - do { - struct of_phandle_args args; - - ret = of_parse_phandle_with_args(dev->of_node, - "msi-parent", "#msi-cells", - index, &args); - if (ret) - break; - /* - * The IRQ domain fwnode is the msi controller parent - * in GICv5 (where the msi controller nodes are the - * ITS translate frames). - */ - if (args.np->parent == irq_domain_get_of_node(domain)) { - if (WARN_ON(args.args_count != 1)) - return -EINVAL; - *dev_id = args.args[0]; - - ret = its_translate_frame_address(args.np, pa); - if (ret) - return -ENODEV; - break; - } - index++; - } while (!ret); + if (!ret && pa) + ret = its_translate_frame_address(it.node, pa); - if (ret) { - struct device_node *np = NULL; + if (!ret) + *dev_id = args; - ret = of_map_id(dev->of_node, dev->id, "msi-map", "msi-map-mask", &np, dev_id); - if (np) { - ret = its_translate_frame_address(np, pa); - of_node_put(np); + of_node_put(it.node); + return ret; } } - return ret; + struct device_node *msi_ctrl __free(device_node) = NULL; + + return of_map_id(dev->of_node, dev->id, "msi-map", "msi-map-mask", &msi_ctrl, dev_id); } int __weak iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id) @@ -234,7 +189,7 @@ static int its_pmsi_prepare(struct irq_domain *domain, struct device *dev, int ret; if (dev->of_node) - ret = of_pmsi_get_dev_id(domain->parent, dev, &dev_id); + ret = of_pmsi_get_msi_info(domain->parent, dev, &dev_id, NULL); else ret = iort_pmsi_get_dev_id(dev, &dev_id); if (ret) @@ -262,7 +217,7 @@ static int its_v5_pmsi_prepare(struct irq_domain *domain, struct device *dev, if (!dev->of_node) return -ENODEV; - ret = of_v5_pmsi_get_msi_info(domain->parent, dev, &dev_id, &pa); + ret = of_pmsi_get_msi_info(domain->parent, dev, &dev_id, &pa); if (ret) return ret; diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 3de351e66ee8..6607ab58f72e 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -26,7 +26,6 @@ #include <linux/irqchip/arm-gic-common.h> #include <linux/irqchip/arm-gic-v3.h> #include <linux/irqchip/arm-gic-v3-prio.h> -#include <linux/irqchip/irq-partition-percpu.h> #include <linux/bitfield.h> #include <linux/bits.h> #include <linux/arm-smccc.h> @@ -46,8 +45,6 @@ static u8 dist_prio_nmi __ro_after_init = GICV3_PRIO_NMI; #define FLAGS_WORKAROUND_ASR_ERRATUM_8601001 (1ULL << 2) #define FLAGS_WORKAROUND_INSECURE (1ULL << 3) -#define GIC_IRQ_TYPE_PARTITION (GIC_IRQ_TYPE_LPI + 1) - static struct cpumask broken_rdists __read_mostly __maybe_unused; struct redist_region { @@ -68,7 +65,13 @@ struct gic_chip_data { u64 flags; bool has_rss; unsigned int ppi_nr; - struct partition_desc **ppi_descs; + struct partition_affinity *parts; + unsigned int nr_parts; +}; + +struct partition_affinity { + cpumask_t mask; + struct fwnode_handle *partition_id; }; #define T241_CHIPS_MAX 4 @@ -228,9 +231,6 @@ static void __init gic_prio_init(void) !cpus_have_group0); } -/* rdist_nmi_refs[n] == number of cpus having the rdist interrupt n set as NMI */ -static refcount_t *rdist_nmi_refs; - static struct gic_kvm_info gic_v3_kvm_info __initdata; static DEFINE_PER_CPU(bool, has_rss); @@ -594,36 +594,6 @@ static void gic_irq_set_prio(struct irq_data *d, u8 prio) writeb_relaxed(prio, base + offset + index); } -static u32 __gic_get_ppi_index(irq_hw_number_t hwirq) -{ - switch (__get_intid_range(hwirq)) { - case PPI_RANGE: - return hwirq - 16; - case EPPI_RANGE: - return hwirq - EPPI_BASE_INTID + 16; - default: - unreachable(); - } -} - -static u32 __gic_get_rdist_index(irq_hw_number_t hwirq) -{ - switch (__get_intid_range(hwirq)) { - case SGI_RANGE: - case PPI_RANGE: - return hwirq; - case EPPI_RANGE: - return hwirq - EPPI_BASE_INTID + 32; - default: - unreachable(); - } -} - -static u32 gic_get_rdist_index(struct irq_data *d) -{ - return __gic_get_rdist_index(d->hwirq); -} - static int gic_irq_nmi_setup(struct irq_data *d) { struct irq_desc *desc = irq_to_desc(d->irq); @@ -644,20 +614,8 @@ static int gic_irq_nmi_setup(struct irq_data *d) return -EINVAL; /* desc lock should already be held */ - if (gic_irq_in_rdist(d)) { - u32 idx = gic_get_rdist_index(d); - - /* - * Setting up a percpu interrupt as NMI, only switch handler - * for first NMI - */ - if (!refcount_inc_not_zero(&rdist_nmi_refs[idx])) { - refcount_set(&rdist_nmi_refs[idx], 1); - desc->handle_irq = handle_percpu_devid_fasteoi_nmi; - } - } else { + if (!gic_irq_in_rdist(d)) desc->handle_irq = handle_fasteoi_nmi; - } gic_irq_set_prio(d, dist_prio_nmi); @@ -684,15 +642,8 @@ static void gic_irq_nmi_teardown(struct irq_data *d) return; /* desc lock should already be held */ - if (gic_irq_in_rdist(d)) { - u32 idx = gic_get_rdist_index(d); - - /* Tearing down NMI, only switch handler for last NMI */ - if (refcount_dec_and_test(&rdist_nmi_refs[idx])) - desc->handle_irq = handle_percpu_devid_irq; - } else { + if (!gic_irq_in_rdist(d)) desc->handle_irq = handle_fasteoi_irq; - } gic_irq_set_prio(d, dist_prio_irq); } @@ -1666,13 +1617,6 @@ static int gic_irq_domain_translate(struct irq_domain *d, case GIC_IRQ_TYPE_LPI: /* LPI */ *hwirq = fwspec->param[1]; break; - case GIC_IRQ_TYPE_PARTITION: - *hwirq = fwspec->param[1]; - if (fwspec->param[1] >= 16) - *hwirq += EPPI_BASE_INTID - 16; - else - *hwirq += 16; - break; default: return -EINVAL; } @@ -1681,10 +1625,8 @@ static int gic_irq_domain_translate(struct irq_domain *d, /* * Make it clear that broken DTs are... broken. - * Partitioned PPIs are an unfortunate exception. */ - WARN_ON(*type == IRQ_TYPE_NONE && - fwspec->param[0] != GIC_IRQ_TYPE_PARTITION); + WARN_ON(*type == IRQ_TYPE_NONE); return 0; } @@ -1741,33 +1683,12 @@ static void gic_irq_domain_free(struct irq_domain *domain, unsigned int virq, } } -static bool fwspec_is_partitioned_ppi(struct irq_fwspec *fwspec, - irq_hw_number_t hwirq) -{ - enum gic_intid_range range; - - if (!gic_data.ppi_descs) - return false; - - if (!is_of_node(fwspec->fwnode)) - return false; - - if (fwspec->param_count < 4 || !fwspec->param[3]) - return false; - - range = __get_intid_range(hwirq); - if (range != PPI_RANGE && range != EPPI_RANGE) - return false; - - return true; -} - static int gic_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { - unsigned int type, ppi_idx; irq_hw_number_t hwirq; + unsigned int type; int ret; /* Not for us */ @@ -1786,60 +1707,61 @@ static int gic_irq_domain_select(struct irq_domain *d, if (WARN_ON_ONCE(ret)) return 0; - if (!fwspec_is_partitioned_ppi(fwspec, hwirq)) - return d == gic_data.domain; - - /* - * If this is a PPI and we have a 4th (non-null) parameter, - * then we need to match the partition domain. - */ - ppi_idx = __gic_get_ppi_index(hwirq); - return d == partition_get_domain(gic_data.ppi_descs[ppi_idx]); + return d == gic_data.domain; } -static const struct irq_domain_ops gic_irq_domain_ops = { - .translate = gic_irq_domain_translate, - .alloc = gic_irq_domain_alloc, - .free = gic_irq_domain_free, - .select = gic_irq_domain_select, -}; - -static int partition_domain_translate(struct irq_domain *d, - struct irq_fwspec *fwspec, - unsigned long *hwirq, - unsigned int *type) +static int gic_irq_get_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) { - unsigned long ppi_intid; - struct device_node *np; - unsigned int ppi_idx; - int ret; - - if (!gic_data.ppi_descs) - return -ENOMEM; + const struct cpumask *mask = NULL; - np = of_find_node_by_phandle(fwspec->param[3]); - if (WARN_ON(!np)) - return -EINVAL; + info->flags = 0; + info->affinity = NULL; - ret = gic_irq_domain_translate(d, fwspec, &ppi_intid, type); - if (WARN_ON_ONCE(ret)) + /* ACPI is not capable of describing PPI affinity -- yet */ + if (!is_of_node(fwspec->fwnode)) return 0; - ppi_idx = __gic_get_ppi_index(ppi_intid); - ret = partition_translate_id(gic_data.ppi_descs[ppi_idx], - of_fwnode_handle(np)); - if (ret < 0) - return ret; + /* If the specifier provides an affinity, use it */ + if (fwspec->param_count == 4 && fwspec->param[3]) { + struct fwnode_handle *fw; + + switch (fwspec->param[0]) { + case 1: /* PPI */ + case 3: /* EPPI */ + break; + default: + return 0; + } + + fw = of_fwnode_handle(of_find_node_by_phandle(fwspec->param[3])); + if (!fw) + return -ENOENT; + + for (int i = 0; i < gic_data.nr_parts; i++) { + if (gic_data.parts[i].partition_id == fw) { + mask = &gic_data.parts[i].mask; + break; + } + } + + if (!mask) + return -ENOENT; + } else { + mask = cpu_possible_mask; + } - *hwirq = ret; - *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; + info->affinity = mask; + info->flags = IRQ_FWSPEC_INFO_AFFINITY_VALID; return 0; } -static const struct irq_domain_ops partition_domain_ops = { - .translate = partition_domain_translate, +static const struct irq_domain_ops gic_irq_domain_ops = { + .translate = gic_irq_domain_translate, + .alloc = gic_irq_domain_alloc, + .free = gic_irq_domain_free, .select = gic_irq_domain_select, + .get_fwspec_info = gic_irq_get_fwspec_info, }; static bool gic_enable_quirk_msm8996(void *data) @@ -2030,19 +1952,9 @@ static const struct gic_quirk gic_quirks[] = { static void gic_enable_nmi_support(void) { - int i; - if (!gic_prio_masking_enabled() || nmi_support_forbidden) return; - rdist_nmi_refs = kcalloc(gic_data.ppi_nr + SGI_NR, - sizeof(*rdist_nmi_refs), GFP_KERNEL); - if (!rdist_nmi_refs) - return; - - for (i = 0; i < gic_data.ppi_nr + SGI_NR; i++) - refcount_set(&rdist_nmi_refs[i], 0); - pr_info("Pseudo-NMIs enabled using %s ICC_PMR_EL1 synchronisation\n", gic_has_relaxed_pmr_sync() ? "relaxed" : "forced"); @@ -2174,12 +2086,7 @@ static void __init gic_populate_ppi_partitions(struct device_node *gic_node) if (!parts_node) return; - gic_data.ppi_descs = kcalloc(gic_data.ppi_nr, sizeof(*gic_data.ppi_descs), GFP_KERNEL); - if (!gic_data.ppi_descs) - goto out_put_node; - nr_parts = of_get_child_count(parts_node); - if (!nr_parts) goto out_put_node; @@ -2232,29 +2139,8 @@ static void __init gic_populate_ppi_partitions(struct device_node *gic_node) part_idx++; } - for (i = 0; i < gic_data.ppi_nr; i++) { - unsigned int irq; - struct partition_desc *desc; - struct irq_fwspec ppi_fwspec = { - .fwnode = gic_data.fwnode, - .param_count = 3, - .param = { - [0] = GIC_IRQ_TYPE_PARTITION, - [1] = i, - [2] = IRQ_TYPE_NONE, - }, - }; - - irq = irq_create_fwspec_mapping(&ppi_fwspec); - if (WARN_ON(!irq)) - continue; - desc = partition_create_desc(gic_data.fwnode, parts, nr_parts, - irq, &partition_domain_ops); - if (WARN_ON(!desc)) - continue; - - gic_data.ppi_descs[i] = desc; - } + gic_data.parts = parts; + gic_data.nr_parts = nr_parts; out_put_node: of_node_put(parts_node); diff --git a/drivers/irqchip/irq-imx-mu-msi.c b/drivers/irqchip/irq-imx-mu-msi.c index d2a4e8a61a42..c598f2f52fc6 100644 --- a/drivers/irqchip/irq-imx-mu-msi.c +++ b/drivers/irqchip/irq-imx-mu-msi.c @@ -296,11 +296,9 @@ static const struct imx_mu_dcfg imx_mu_cfg_imx8ulp = { }, }; -static int __init imx_mu_of_init(struct device_node *dn, - struct device_node *parent, - const struct imx_mu_dcfg *cfg) +static int imx_mu_probe(struct platform_device *pdev, struct device_node *parent, + const struct imx_mu_dcfg *cfg) { - struct platform_device *pdev = of_find_device_by_node(dn); struct device_link *pd_link_a; struct device_link *pd_link_b; struct imx_mu_msi *msi_data; @@ -416,31 +414,27 @@ static const struct dev_pm_ops imx_mu_pm_ops = { imx_mu_runtime_resume, NULL) }; -static int __init imx_mu_imx7ulp_of_init(struct device_node *dn, - struct device_node *parent) +static int imx_mu_imx7ulp_probe(struct platform_device *pdev, struct device_node *parent) { - return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx7ulp); + return imx_mu_probe(pdev, parent, &imx_mu_cfg_imx7ulp); } -static int __init imx_mu_imx6sx_of_init(struct device_node *dn, - struct device_node *parent) +static int imx_mu_imx6sx_probe(struct platform_device *pdev, struct device_node *parent) { - return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx6sx); + return imx_mu_probe(pdev, parent, &imx_mu_cfg_imx6sx); } -static int __init imx_mu_imx8ulp_of_init(struct device_node *dn, - struct device_node *parent) +static int imx_mu_imx8ulp_probe(struct platform_device *pdev, struct device_node *parent) { - return imx_mu_of_init(dn, parent, &imx_mu_cfg_imx8ulp); + return imx_mu_probe(pdev, parent, &imx_mu_cfg_imx8ulp); } IRQCHIP_PLATFORM_DRIVER_BEGIN(imx_mu_msi) -IRQCHIP_MATCH("fsl,imx7ulp-mu-msi", imx_mu_imx7ulp_of_init) -IRQCHIP_MATCH("fsl,imx6sx-mu-msi", imx_mu_imx6sx_of_init) -IRQCHIP_MATCH("fsl,imx8ulp-mu-msi", imx_mu_imx8ulp_of_init) +IRQCHIP_MATCH("fsl,imx7ulp-mu-msi", imx_mu_imx7ulp_probe) +IRQCHIP_MATCH("fsl,imx6sx-mu-msi", imx_mu_imx6sx_probe) +IRQCHIP_MATCH("fsl,imx8ulp-mu-msi", imx_mu_imx8ulp_probe) IRQCHIP_PLATFORM_DRIVER_END(imx_mu_msi, .pm = &imx_mu_pm_ops) - MODULE_AUTHOR("Frank Li <Frank.Li@nxp.com>"); MODULE_DESCRIPTION("Freescale MU MSI controller driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/irqchip/irq-mchp-eic.c b/drivers/irqchip/irq-mchp-eic.c index 516a3a0e359c..b513a899c085 100644 --- a/drivers/irqchip/irq-mchp-eic.c +++ b/drivers/irqchip/irq-mchp-eic.c @@ -199,8 +199,9 @@ static const struct irq_domain_ops mchp_eic_domain_ops = { .free = irq_domain_free_irqs_common, }; -static int mchp_eic_init(struct device_node *node, struct device_node *parent) +static int mchp_eic_probe(struct platform_device *pdev, struct device_node *parent) { + struct device_node *node = pdev->dev.of_node; struct irq_domain *parent_domain = NULL; int ret, i; @@ -273,7 +274,7 @@ free: } IRQCHIP_PLATFORM_DRIVER_BEGIN(mchp_eic) -IRQCHIP_MATCH("microchip,sama7g5-eic", mchp_eic_init) +IRQCHIP_MATCH("microchip,sama7g5-eic", mchp_eic_probe) IRQCHIP_PLATFORM_DRIVER_END(mchp_eic) MODULE_DESCRIPTION("Microchip External Interrupt Controller"); diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c index 7d177626d64b..3fcbb044ae60 100644 --- a/drivers/irqchip/irq-meson-gpio.c +++ b/drivers/irqchip/irq-meson-gpio.c @@ -174,6 +174,14 @@ static const struct meson_gpio_irq_params s4_params = { INIT_MESON_S4_COMMON_DATA(82) }; +static const struct meson_gpio_irq_params s6_params = { + INIT_MESON_S4_COMMON_DATA(100) +}; + +static const struct meson_gpio_irq_params s7_params = { + INIT_MESON_S4_COMMON_DATA(84) +}; + static const struct meson_gpio_irq_params c3_params = { INIT_MESON_S4_COMMON_DATA(55) }; @@ -195,6 +203,9 @@ static const struct of_device_id meson_irq_gpio_matches[] __maybe_unused = { { .compatible = "amlogic,a4-gpio-ao-intc", .data = &a4_ao_params }, { .compatible = "amlogic,a4-gpio-intc", .data = &a4_params }, { .compatible = "amlogic,a5-gpio-intc", .data = &a5_params }, + { .compatible = "amlogic,s6-gpio-intc", .data = &s6_params }, + { .compatible = "amlogic,s7-gpio-intc", .data = &s7_params }, + { .compatible = "amlogic,s7d-gpio-intc", .data = &s7_params }, { .compatible = "amlogic,c3-gpio-intc", .data = &c3_params }, { .compatible = "amlogic,t7-gpio-intc", .data = &t7_params }, { } @@ -572,8 +583,9 @@ static int meson_gpio_irq_parse_dt(struct device_node *node, struct meson_gpio_i return 0; } -static int meson_gpio_irq_of_init(struct device_node *node, struct device_node *parent) +static int meson_gpio_irq_probe(struct platform_device *pdev, struct device_node *parent) { + struct device_node *node = pdev->dev.of_node; struct irq_domain *domain, *parent_domain; struct meson_gpio_irq_controller *ctl; int ret; @@ -630,10 +642,9 @@ free_ctl: } IRQCHIP_PLATFORM_DRIVER_BEGIN(meson_gpio_intc) -IRQCHIP_MATCH("amlogic,meson-gpio-intc", meson_gpio_irq_of_init) +IRQCHIP_MATCH("amlogic,meson-gpio-intc", meson_gpio_irq_probe) IRQCHIP_PLATFORM_DRIVER_END(meson_gpio_intc) MODULE_AUTHOR("Jerome Brunet <jbrunet@baylibre.com>"); MODULE_DESCRIPTION("Meson GPIO Interrupt Multiplexer driver"); MODULE_LICENSE("GPL v2"); -MODULE_ALIAS("platform:meson-gpio-intc"); diff --git a/drivers/irqchip/irq-mvebu-pic.c b/drivers/irqchip/irq-mvebu-pic.c index cd8b73482b9f..10b85128183a 100644 --- a/drivers/irqchip/irq-mvebu-pic.c +++ b/drivers/irqchip/irq-mvebu-pic.c @@ -195,5 +195,3 @@ MODULE_AUTHOR("Yehuda Yitschak <yehuday@marvell.com>"); MODULE_AUTHOR("Thomas Petazzoni <thomas.petazzoni@free-electrons.com>"); MODULE_DESCRIPTION("Marvell Armada 7K/8K PIC driver"); MODULE_LICENSE("GPL v2"); -MODULE_ALIAS("platform:mvebu_pic"); - diff --git a/drivers/irqchip/irq-partition-percpu.c b/drivers/irqchip/irq-partition-percpu.c deleted file mode 100644 index 4441ffe149ea..000000000000 --- a/drivers/irqchip/irq-partition-percpu.c +++ /dev/null @@ -1,241 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2016 ARM Limited, All Rights Reserved. - * Author: Marc Zyngier <marc.zyngier@arm.com> - */ - -#include <linux/bitops.h> -#include <linux/interrupt.h> -#include <linux/irqchip.h> -#include <linux/irqchip/chained_irq.h> -#include <linux/irqchip/irq-partition-percpu.h> -#include <linux/irqdomain.h> -#include <linux/seq_file.h> -#include <linux/slab.h> - -struct partition_desc { - int nr_parts; - struct partition_affinity *parts; - struct irq_domain *domain; - struct irq_desc *chained_desc; - unsigned long *bitmap; - struct irq_domain_ops ops; -}; - -static bool partition_check_cpu(struct partition_desc *part, - unsigned int cpu, unsigned int hwirq) -{ - return cpumask_test_cpu(cpu, &part->parts[hwirq].mask); -} - -static void partition_irq_mask(struct irq_data *d) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - if (partition_check_cpu(part, smp_processor_id(), d->hwirq) && - chip->irq_mask) - chip->irq_mask(data); -} - -static void partition_irq_unmask(struct irq_data *d) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - if (partition_check_cpu(part, smp_processor_id(), d->hwirq) && - chip->irq_unmask) - chip->irq_unmask(data); -} - -static int partition_irq_set_irqchip_state(struct irq_data *d, - enum irqchip_irq_state which, - bool val) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - if (partition_check_cpu(part, smp_processor_id(), d->hwirq) && - chip->irq_set_irqchip_state) - return chip->irq_set_irqchip_state(data, which, val); - - return -EINVAL; -} - -static int partition_irq_get_irqchip_state(struct irq_data *d, - enum irqchip_irq_state which, - bool *val) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - if (partition_check_cpu(part, smp_processor_id(), d->hwirq) && - chip->irq_get_irqchip_state) - return chip->irq_get_irqchip_state(data, which, val); - - return -EINVAL; -} - -static int partition_irq_set_type(struct irq_data *d, unsigned int type) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - if (chip->irq_set_type) - return chip->irq_set_type(data, type); - - return -EINVAL; -} - -static void partition_irq_print_chip(struct irq_data *d, struct seq_file *p) -{ - struct partition_desc *part = irq_data_get_irq_chip_data(d); - struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); - struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - - seq_printf(p, "%5s-%lu", chip->name, data->hwirq); -} - -static struct irq_chip partition_irq_chip = { - .irq_mask = partition_irq_mask, - .irq_unmask = partition_irq_unmask, - .irq_set_type = partition_irq_set_type, - .irq_get_irqchip_state = partition_irq_get_irqchip_state, - .irq_set_irqchip_state = partition_irq_set_irqchip_state, - .irq_print_chip = partition_irq_print_chip, -}; - -static void partition_handle_irq(struct irq_desc *desc) -{ - struct partition_desc *part = irq_desc_get_handler_data(desc); - struct irq_chip *chip = irq_desc_get_chip(desc); - int cpu = smp_processor_id(); - int hwirq; - - chained_irq_enter(chip, desc); - - for_each_set_bit(hwirq, part->bitmap, part->nr_parts) { - if (partition_check_cpu(part, cpu, hwirq)) - break; - } - - if (unlikely(hwirq == part->nr_parts)) - handle_bad_irq(desc); - else - generic_handle_domain_irq(part->domain, hwirq); - - chained_irq_exit(chip, desc); -} - -static int partition_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) -{ - int ret; - irq_hw_number_t hwirq; - unsigned int type; - struct irq_fwspec *fwspec = arg; - struct partition_desc *part; - - BUG_ON(nr_irqs != 1); - ret = domain->ops->translate(domain, fwspec, &hwirq, &type); - if (ret) - return ret; - - part = domain->host_data; - - set_bit(hwirq, part->bitmap); - irq_set_chained_handler_and_data(irq_desc_get_irq(part->chained_desc), - partition_handle_irq, part); - irq_set_percpu_devid_partition(virq, &part->parts[hwirq].mask); - irq_domain_set_info(domain, virq, hwirq, &partition_irq_chip, part, - handle_percpu_devid_irq, NULL, NULL); - irq_set_status_flags(virq, IRQ_NOAUTOEN); - - return 0; -} - -static void partition_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) -{ - struct irq_data *d; - - BUG_ON(nr_irqs != 1); - - d = irq_domain_get_irq_data(domain, virq); - irq_set_handler(virq, NULL); - irq_domain_reset_irq_data(d); -} - -int partition_translate_id(struct partition_desc *desc, void *partition_id) -{ - struct partition_affinity *part = NULL; - int i; - - for (i = 0; i < desc->nr_parts; i++) { - if (desc->parts[i].partition_id == partition_id) { - part = &desc->parts[i]; - break; - } - } - - if (WARN_ON(!part)) { - pr_err("Failed to find partition\n"); - return -EINVAL; - } - - return i; -} - -struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode, - struct partition_affinity *parts, - int nr_parts, - int chained_irq, - const struct irq_domain_ops *ops) -{ - struct partition_desc *desc; - struct irq_domain *d; - - BUG_ON(!ops->select || !ops->translate); - - desc = kzalloc(sizeof(*desc), GFP_KERNEL); - if (!desc) - return NULL; - - desc->ops = *ops; - desc->ops.free = partition_domain_free; - desc->ops.alloc = partition_domain_alloc; - - d = irq_domain_create_linear(fwnode, nr_parts, &desc->ops, desc); - if (!d) - goto out; - desc->domain = d; - - desc->bitmap = bitmap_zalloc(nr_parts, GFP_KERNEL); - if (WARN_ON(!desc->bitmap)) - goto out; - - desc->chained_desc = irq_to_desc(chained_irq); - desc->nr_parts = nr_parts; - desc->parts = parts; - - return desc; -out: - if (d) - irq_domain_remove(d); - kfree(desc); - - return NULL; -} - -struct irq_domain *partition_get_domain(struct partition_desc *dsc) -{ - if (dsc) - return dsc->domain; - - return NULL; -} diff --git a/drivers/irqchip/irq-qcom-mpm.c b/drivers/irqchip/irq-qcom-mpm.c index 8d569f7c5a7a..83f31ea657b7 100644 --- a/drivers/irqchip/irq-qcom-mpm.c +++ b/drivers/irqchip/irq-qcom-mpm.c @@ -320,9 +320,9 @@ static bool gic_hwirq_is_mapped(struct mpm_gic_map *maps, int cnt, u32 hwirq) return false; } -static int qcom_mpm_init(struct device_node *np, struct device_node *parent) +static int qcom_mpm_probe(struct platform_device *pdev, struct device_node *parent) { - struct platform_device *pdev = of_find_device_by_node(np); + struct device_node *np = pdev->dev.of_node; struct device *dev = &pdev->dev; struct irq_domain *parent_domain; struct generic_pm_domain *genpd; @@ -478,7 +478,7 @@ remove_genpd: } IRQCHIP_PLATFORM_DRIVER_BEGIN(qcom_mpm) -IRQCHIP_MATCH("qcom,mpm", qcom_mpm_init) +IRQCHIP_MATCH("qcom,mpm", qcom_mpm_probe) IRQCHIP_PLATFORM_DRIVER_END(qcom_mpm) MODULE_DESCRIPTION("Qualcomm Technologies, Inc. MSM Power Manager"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 2a54adeb4cc7..1bf19deb02c4 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -8,7 +8,6 @@ */ #include <linux/bitfield.h> -#include <linux/cleanup.h> #include <linux/clk.h> #include <linux/err.h> #include <linux/io.h> @@ -528,18 +527,15 @@ static int rzg2l_irqc_parse_interrupts(struct rzg2l_irqc_priv *priv, return 0; } -static int rzg2l_irqc_common_init(struct device_node *node, struct device_node *parent, - const struct irq_chip *irq_chip) +static int rzg2l_irqc_common_probe(struct platform_device *pdev, struct device_node *parent, + const struct irq_chip *irq_chip) { - struct platform_device *pdev = of_find_device_by_node(node); - struct device *dev __free(put_device) = pdev ? &pdev->dev : NULL; struct irq_domain *irq_domain, *parent_domain; + struct device_node *node = pdev->dev.of_node; + struct device *dev = &pdev->dev; struct reset_control *resetn; int ret; - if (!pdev) - return -ENODEV; - parent_domain = irq_find_host(parent); if (!parent_domain) return dev_err_probe(dev, -ENODEV, "cannot find parent domain\n"); @@ -583,35 +579,22 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * register_syscore_ops(&rzg2l_irqc_syscore_ops); - /* - * Prevent the cleanup function from invoking put_device by assigning - * NULL to dev. - * - * make coccicheck will complain about missing put_device calls, but - * those are false positives, as dev will be automatically "put" via - * __free_put_device on the failing path. - * On the successful path we don't actually want to "put" dev. - */ - dev = NULL; - return 0; } -static int __init rzg2l_irqc_init(struct device_node *node, - struct device_node *parent) +static int rzg2l_irqc_probe(struct platform_device *pdev, struct device_node *parent) { - return rzg2l_irqc_common_init(node, parent, &rzg2l_irqc_chip); + return rzg2l_irqc_common_probe(pdev, parent, &rzg2l_irqc_chip); } -static int __init rzfive_irqc_init(struct device_node *node, - struct device_node *parent) +static int rzfive_irqc_probe(struct platform_device *pdev, struct device_node *parent) { - return rzg2l_irqc_common_init(node, parent, &rzfive_irqc_chip); + return rzg2l_irqc_common_probe(pdev, parent, &rzfive_irqc_chip); } IRQCHIP_PLATFORM_DRIVER_BEGIN(rzg2l_irqc) -IRQCHIP_MATCH("renesas,rzg2l-irqc", rzg2l_irqc_init) -IRQCHIP_MATCH("renesas,r9a07g043f-irqc", rzfive_irqc_init) +IRQCHIP_MATCH("renesas,rzg2l-irqc", rzg2l_irqc_probe) +IRQCHIP_MATCH("renesas,r9a07g043f-irqc", rzfive_irqc_probe) IRQCHIP_PLATFORM_DRIVER_END(rzg2l_irqc) MODULE_AUTHOR("Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>"); MODULE_DESCRIPTION("Renesas RZ/G2L IRQC Driver"); diff --git a/drivers/irqchip/irq-renesas-rzv2h.c b/drivers/irqchip/irq-renesas-rzv2h.c index 9018d9c3911e..899a423b5da8 100644 --- a/drivers/irqchip/irq-renesas-rzv2h.c +++ b/drivers/irqchip/irq-renesas-rzv2h.c @@ -490,29 +490,15 @@ static int rzv2h_icu_parse_interrupts(struct rzv2h_icu_priv *priv, struct device return 0; } -static void rzv2h_icu_put_device(void *data) -{ - put_device(data); -} - -static int rzv2h_icu_init_common(struct device_node *node, struct device_node *parent, - const struct rzv2h_hw_info *hw_info) +static int rzv2h_icu_probe_common(struct platform_device *pdev, struct device_node *parent, + const struct rzv2h_hw_info *hw_info) { struct irq_domain *irq_domain, *parent_domain; + struct device_node *node = pdev->dev.of_node; struct rzv2h_icu_priv *rzv2h_icu_data; - struct platform_device *pdev; struct reset_control *resetn; int ret; - pdev = of_find_device_by_node(node); - if (!pdev) - return -ENODEV; - - ret = devm_add_action_or_reset(&pdev->dev, rzv2h_icu_put_device, - &pdev->dev); - if (ret < 0) - return ret; - parent_domain = irq_find_host(parent); if (!parent_domain) { dev_err(&pdev->dev, "cannot find parent domain\n"); @@ -618,19 +604,19 @@ static const struct rzv2h_hw_info rzv2h_hw_params = { .field_width = 8, }; -static int rzg3e_icu_init(struct device_node *node, struct device_node *parent) +static int rzg3e_icu_probe(struct platform_device *pdev, struct device_node *parent) { - return rzv2h_icu_init_common(node, parent, &rzg3e_hw_params); + return rzv2h_icu_probe_common(pdev, parent, &rzg3e_hw_params); } -static int rzv2h_icu_init(struct device_node *node, struct device_node *parent) +static int rzv2h_icu_probe(struct platform_device *pdev, struct device_node *parent) { - return rzv2h_icu_init_common(node, parent, &rzv2h_hw_params); + return rzv2h_icu_probe_common(pdev, parent, &rzv2h_hw_params); } IRQCHIP_PLATFORM_DRIVER_BEGIN(rzv2h_icu) -IRQCHIP_MATCH("renesas,r9a09g047-icu", rzg3e_icu_init) -IRQCHIP_MATCH("renesas,r9a09g057-icu", rzv2h_icu_init) +IRQCHIP_MATCH("renesas,r9a09g047-icu", rzg3e_icu_probe) +IRQCHIP_MATCH("renesas,r9a09g057-icu", rzv2h_icu_probe) IRQCHIP_PLATFORM_DRIVER_END(rzv2h_icu) MODULE_AUTHOR("Fabrizio Castro <fabrizio.castro.jz@renesas.com>"); MODULE_DESCRIPTION("Renesas RZ/V2H(P) ICU Driver"); diff --git a/drivers/irqchip/irq-riscv-imsic-early.c b/drivers/irqchip/irq-riscv-imsic-early.c index 2c4c682627b8..6bac67cc0b6d 100644 --- a/drivers/irqchip/irq-riscv-imsic-early.c +++ b/drivers/irqchip/irq-riscv-imsic-early.c @@ -91,9 +91,8 @@ static int __init imsic_ipi_domain_init(void) { return 0; } */ static void imsic_handle_irq(struct irq_desc *desc) { + struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv); struct irq_chip *chip = irq_desc_get_chip(desc); - int cpu = smp_processor_id(); - struct imsic_vector *vec; unsigned long local_id; /* @@ -113,16 +112,12 @@ static void imsic_handle_irq(struct irq_desc *desc) continue; } - if (unlikely(!imsic->base_domain)) - continue; - - vec = imsic_vector_from_local_id(cpu, local_id); - if (!vec) { + if (unlikely(local_id > imsic->global.nr_ids)) { pr_warn_ratelimited("vector not found for local ID 0x%lx\n", local_id); continue; } - generic_handle_irq(vec->irq); + generic_handle_irq(lpriv->vectors[local_id].irq); } chained_irq_exit(chip, desc); diff --git a/drivers/irqchip/irq-riscv-imsic-platform.c b/drivers/irqchip/irq-riscv-imsic-platform.c index 643c8e459611..7228a33f6c37 100644 --- a/drivers/irqchip/irq-riscv-imsic-platform.c +++ b/drivers/irqchip/irq-riscv-imsic-platform.c @@ -158,11 +158,11 @@ static int imsic_irq_set_affinity(struct irq_data *d, const struct cpumask *mask tmp_vec.local_id = new_vec->local_id; /* Point device to the temporary vector */ - imsic_msi_update_msg(irq_get_irq_data(d->irq), &tmp_vec); + imsic_msi_update_msg(d, &tmp_vec); } /* Point device to the new vector */ - imsic_msi_update_msg(irq_get_irq_data(d->irq), new_vec); + imsic_msi_update_msg(d, new_vec); /* Update irq descriptors with the new vector */ d->chip_data = new_vec; diff --git a/drivers/irqchip/irq-riscv-imsic-state.c b/drivers/irqchip/irq-riscv-imsic-state.c index dc95ad856d80..385368052d5c 100644 --- a/drivers/irqchip/irq-riscv-imsic-state.c +++ b/drivers/irqchip/irq-riscv-imsic-state.c @@ -434,16 +434,6 @@ void imsic_vector_debug_show_summary(struct seq_file *m, int ind) } #endif -struct imsic_vector *imsic_vector_from_local_id(unsigned int cpu, unsigned int local_id) -{ - struct imsic_local_priv *lpriv = per_cpu_ptr(imsic->lpriv, cpu); - - if (!lpriv || imsic->global.nr_ids < local_id) - return NULL; - - return &lpriv->vectors[local_id]; -} - struct imsic_vector *imsic_vector_alloc(unsigned int irq, const struct cpumask *mask) { struct imsic_vector *vec = NULL; @@ -487,7 +477,6 @@ static void __init imsic_local_cleanup(void) lpriv = per_cpu_ptr(imsic->lpriv, cpu); bitmap_free(lpriv->dirty_bitmap); - kfree(lpriv->vectors); } free_percpu(imsic->lpriv); @@ -501,7 +490,8 @@ static int __init imsic_local_init(void) int cpu, i; /* Allocate per-CPU private state */ - imsic->lpriv = alloc_percpu(typeof(*imsic->lpriv)); + imsic->lpriv = __alloc_percpu(struct_size(imsic->lpriv, vectors, global->nr_ids + 1), + __alignof__(*imsic->lpriv)); if (!imsic->lpriv) return -ENOMEM; @@ -521,12 +511,6 @@ static int __init imsic_local_init(void) timer_setup(&lpriv->timer, imsic_local_timer_callback, TIMER_PINNED); #endif - /* Allocate vector array */ - lpriv->vectors = kcalloc(global->nr_ids + 1, sizeof(*lpriv->vectors), - GFP_KERNEL); - if (!lpriv->vectors) - goto fail_local_cleanup; - /* Setup vector array */ for (i = 0; i <= global->nr_ids; i++) { vec = &lpriv->vectors[i]; diff --git a/drivers/irqchip/irq-riscv-imsic-state.h b/drivers/irqchip/irq-riscv-imsic-state.h index 57f951952b0c..6332501dcbd8 100644 --- a/drivers/irqchip/irq-riscv-imsic-state.h +++ b/drivers/irqchip/irq-riscv-imsic-state.h @@ -40,7 +40,7 @@ struct imsic_local_priv { #endif /* Local vector table */ - struct imsic_vector *vectors; + struct imsic_vector vectors[]; }; struct imsic_priv { @@ -95,8 +95,6 @@ static inline struct imsic_vector *imsic_vector_get_move(struct imsic_vector *ve void imsic_vector_force_move_cleanup(struct imsic_vector *vec); void imsic_vector_move(struct imsic_vector *old_vec, struct imsic_vector *new_vec); -struct imsic_vector *imsic_vector_from_local_id(unsigned int cpu, unsigned int local_id); - struct imsic_vector *imsic_vector_alloc(unsigned int irq, const struct cpumask *mask); void imsic_vector_free(struct imsic_vector *vector); diff --git a/drivers/irqchip/irq-riscv-intc.c b/drivers/irqchip/irq-riscv-intc.c index e5805885394e..70290b35b317 100644 --- a/drivers/irqchip/irq-riscv-intc.c +++ b/drivers/irqchip/irq-riscv-intc.c @@ -166,7 +166,8 @@ static int riscv_intc_domain_alloc(struct irq_domain *domain, static const struct irq_domain_ops riscv_intc_domain_ops = { .map = riscv_intc_domain_map, .xlate = irq_domain_xlate_onecell, - .alloc = riscv_intc_domain_alloc + .alloc = riscv_intc_domain_alloc, + .free = irq_domain_free_irqs_top, }; static struct fwnode_handle *riscv_intc_hwnode(void) diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c index cbd7697bc148..c5db7d6e3f7c 100644 --- a/drivers/irqchip/irq-sifive-plic.c +++ b/drivers/irqchip/irq-sifive-plic.c @@ -49,6 +49,8 @@ #define CONTEXT_ENABLE_BASE 0x2000 #define CONTEXT_ENABLE_SIZE 0x80 +#define PENDING_BASE 0x1000 + /* * Each hart context has a set of control registers associated with it. Right * now there's only two: a source priority threshold over which the hart will @@ -63,6 +65,7 @@ #define PLIC_ENABLE_THRESHOLD 0 #define PLIC_QUIRK_EDGE_INTERRUPT 0 +#define PLIC_QUIRK_CP100_CLAIM_REGISTER_ERRATUM 1 struct plic_priv { struct fwnode_handle *fwnode; @@ -94,15 +97,22 @@ static DEFINE_PER_CPU(struct plic_handler, plic_handlers); static int plic_irq_set_type(struct irq_data *d, unsigned int type); -static void __plic_toggle(void __iomem *enable_base, int hwirq, int enable) +static void __plic_toggle(struct plic_handler *handler, int hwirq, int enable) { - u32 __iomem *reg = enable_base + (hwirq / 32) * sizeof(u32); + u32 __iomem *base = handler->enable_base; u32 hwirq_mask = 1 << (hwirq % 32); + int group = hwirq / 32; + u32 value; + + value = readl(base + group); if (enable) - writel(readl(reg) | hwirq_mask, reg); + value |= hwirq_mask; else - writel(readl(reg) & ~hwirq_mask, reg); + value &= ~hwirq_mask; + + handler->enable_save[group] = value; + writel(value, base + group); } static void plic_toggle(struct plic_handler *handler, int hwirq, int enable) @@ -110,7 +120,7 @@ static void plic_toggle(struct plic_handler *handler, int hwirq, int enable) unsigned long flags; raw_spin_lock_irqsave(&handler->enable_lock, flags); - __plic_toggle(handler->enable_base, hwirq, enable); + __plic_toggle(handler, hwirq, enable); raw_spin_unlock_irqrestore(&handler->enable_lock, flags); } @@ -247,33 +257,16 @@ static int plic_irq_set_type(struct irq_data *d, unsigned int type) static int plic_irq_suspend(void) { - unsigned int i, cpu; - unsigned long flags; - u32 __iomem *reg; struct plic_priv *priv; priv = per_cpu_ptr(&plic_handlers, smp_processor_id())->priv; /* irq ID 0 is reserved */ - for (i = 1; i < priv->nr_irqs; i++) { + for (unsigned int i = 1; i < priv->nr_irqs; i++) { __assign_bit(i, priv->prio_save, readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)); } - for_each_present_cpu(cpu) { - struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu); - - if (!handler->present) - continue; - - raw_spin_lock_irqsave(&handler->enable_lock, flags); - for (i = 0; i < DIV_ROUND_UP(priv->nr_irqs, 32); i++) { - reg = handler->enable_base + i * sizeof(u32); - handler->enable_save[i] = readl(reg); - } - raw_spin_unlock_irqrestore(&handler->enable_lock, flags); - } - return 0; } @@ -398,6 +391,98 @@ static void plic_handle_irq(struct irq_desc *desc) chained_irq_exit(chip, desc); } +static u32 cp100_isolate_pending_irq(int nr_irq_groups, struct plic_handler *handler) +{ + u32 __iomem *pending = handler->priv->regs + PENDING_BASE; + u32 __iomem *enable = handler->enable_base; + u32 pending_irqs = 0; + int i, j; + + /* Look for first pending interrupt */ + for (i = 0; i < nr_irq_groups; i++) { + /* Any pending interrupts would be annihilated, so skip checking them */ + if (!handler->enable_save[i]) + continue; + + pending_irqs = handler->enable_save[i] & readl_relaxed(pending + i); + if (pending_irqs) + break; + } + + if (!pending_irqs) + return 0; + + /* Isolate lowest set bit */ + pending_irqs &= -pending_irqs; + + /* Disable all interrupts but the first pending one */ + for (j = 0; j < nr_irq_groups; j++) { + u32 new_mask = j == i ? pending_irqs : 0; + + if (new_mask != handler->enable_save[j]) + writel_relaxed(new_mask, enable + j); + } + return pending_irqs; +} + +static irq_hw_number_t cp100_get_hwirq(struct plic_handler *handler, void __iomem *claim) +{ + int nr_irq_groups = DIV_ROUND_UP(handler->priv->nr_irqs, 32); + u32 __iomem *enable = handler->enable_base; + irq_hw_number_t hwirq = 0; + u32 iso_mask; + int i; + + guard(raw_spinlock)(&handler->enable_lock); + + /* Existing enable state is already cached in enable_save */ + iso_mask = cp100_isolate_pending_irq(nr_irq_groups, handler); + if (!iso_mask) + return 0; + + /* + * Interrupts delievered to hardware still become pending, but only + * interrupts that are both pending and enabled can be claimed. + * Clearing the enable bit for all interrupts but the first pending + * one avoids a hardware bug that occurs during read from the claim + * register with more than one eligible interrupt. + */ + hwirq = readl(claim); + + /* Restore previous state */ + for (i = 0; i < nr_irq_groups; i++) { + u32 written = i == hwirq / 32 ? iso_mask : 0; + u32 stored = handler->enable_save[i]; + + if (stored != written) + writel_relaxed(stored, enable + i); + } + return hwirq; +} + +static void plic_handle_irq_cp100(struct irq_desc *desc) +{ + struct plic_handler *handler = this_cpu_ptr(&plic_handlers); + struct irq_chip *chip = irq_desc_get_chip(desc); + void __iomem *claim = handler->hart_base + CONTEXT_CLAIM; + irq_hw_number_t hwirq; + + WARN_ON_ONCE(!handler->present); + + chained_irq_enter(chip, desc); + + while ((hwirq = cp100_get_hwirq(handler, claim))) { + int err = generic_handle_domain_irq(handler->priv->irqdomain, hwirq); + + if (unlikely(err)) { + pr_warn_ratelimited("%pfwP: can't find mapping for hwirq %lu\n", + handler->priv->fwnode, hwirq); + } + } + + chained_irq_exit(chip, desc); +} + static void plic_set_threshold(struct plic_handler *handler, u32 threshold) { /* priority must be > threshold to trigger an interrupt */ @@ -434,6 +519,8 @@ static const struct of_device_id plic_match[] = { .data = (const void *)BIT(PLIC_QUIRK_EDGE_INTERRUPT) }, { .compatible = "thead,c900-plic", .data = (const void *)BIT(PLIC_QUIRK_EDGE_INTERRUPT) }, + { .compatible = "ultrarisc,cp100-plic", + .data = (const void *)BIT(PLIC_QUIRK_CP100_CLAIM_REGISTER_ERRATUM) }, {} }; @@ -592,12 +679,11 @@ static int plic_probe(struct fwnode_handle *fwnode) if (parent_hwirq != RV_IRQ_EXT) { /* Disable S-mode enable bits if running in M-mode. */ if (IS_ENABLED(CONFIG_RISCV_M_MODE)) { - void __iomem *enable_base = priv->regs + - CONTEXT_ENABLE_BASE + - i * CONTEXT_ENABLE_SIZE; + u32 __iomem *enable_base = priv->regs + CONTEXT_ENABLE_BASE + + i * CONTEXT_ENABLE_SIZE; - for (hwirq = 1; hwirq <= nr_irqs; hwirq++) - __plic_toggle(enable_base, hwirq, 0); + for (int j = 0; j <= nr_irqs / 32; j++) + writel(0, enable_base + j); } continue; } @@ -668,12 +754,17 @@ done: } if (global_setup) { + void (*handler_fn)(struct irq_desc *) = plic_handle_irq; + + if (test_bit(PLIC_QUIRK_CP100_CLAIM_REGISTER_ERRATUM, &handler->priv->plic_quirks)) + handler_fn = plic_handle_irq_cp100; + /* Find parent domain and register chained handler */ domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), DOMAIN_BUS_ANY); if (domain) plic_parent_irq = irq_create_mapping(domain, RV_IRQ_EXT); if (plic_parent_irq) - irq_set_chained_handler(plic_parent_irq, plic_handle_irq); + irq_set_chained_handler(plic_parent_irq, handler_fn); cpuhp_setup_state(CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, "irqchip/sifive/plic:starting", diff --git a/drivers/irqchip/irq-starfive-jh8100-intc.c b/drivers/irqchip/irq-starfive-jh8100-intc.c index 2460798ec158..705361b4ebe0 100644 --- a/drivers/irqchip/irq-starfive-jh8100-intc.c +++ b/drivers/irqchip/irq-starfive-jh8100-intc.c @@ -114,9 +114,9 @@ static void starfive_intc_irq_handler(struct irq_desc *desc) chained_irq_exit(chip, desc); } -static int __init starfive_intc_init(struct device_node *intc, - struct device_node *parent) +static int starfive_intc_probe(struct platform_device *pdev, struct device_node *parent) { + struct device_node *intc = pdev->dev.of_node; struct starfive_irq_chip *irqc; struct reset_control *rst; struct clk *clk; @@ -199,7 +199,7 @@ err_free: } IRQCHIP_PLATFORM_DRIVER_BEGIN(starfive_intc) -IRQCHIP_MATCH("starfive,jh8100-intc", starfive_intc_init) +IRQCHIP_MATCH("starfive,jh8100-intc", starfive_intc_probe) IRQCHIP_PLATFORM_DRIVER_END(starfive_intc) MODULE_DESCRIPTION("StarFive JH8100 External Interrupt Controller"); diff --git a/drivers/irqchip/irq-ts4800.c b/drivers/irqchip/irq-ts4800.c index 1e236d5b7516..2e4013c6834d 100644 --- a/drivers/irqchip/irq-ts4800.c +++ b/drivers/irqchip/irq-ts4800.c @@ -165,4 +165,3 @@ module_platform_driver(ts4800_ic_driver); MODULE_AUTHOR("Damien Riegel <damien.riegel@savoirfairelinux.com>"); MODULE_DESCRIPTION("Multiplexed-IRQs driver for TS-4800's FPGA"); MODULE_LICENSE("GPL v2"); -MODULE_ALIAS("platform:ts4800_irqc"); diff --git a/drivers/irqchip/irqchip.c b/drivers/irqchip/irqchip.c index 0ee7b6b71f5f..689c8e448901 100644 --- a/drivers/irqchip/irqchip.c +++ b/drivers/irqchip/irqchip.c @@ -36,11 +36,10 @@ int platform_irqchip_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct device_node *par_np __free(device_node) = of_irq_find_parent(np); - of_irq_init_cb_t irq_init_cb = of_device_get_match_data(&pdev->dev); + platform_irq_probe_t irq_probe = of_device_get_match_data(&pdev->dev); - if (!irq_init_cb) { + if (!irq_probe) return -EINVAL; - } if (par_np == np) par_np = NULL; @@ -53,10 +52,9 @@ int platform_irqchip_probe(struct platform_device *pdev) * interrupt controller. The actual initialization callback of this * interrupt controller can check for specific domains as necessary. */ - if (par_np && !irq_find_matching_host(par_np, DOMAIN_BUS_ANY)) { + if (par_np && !irq_find_matching_host(par_np, DOMAIN_BUS_ANY)) return -EPROBE_DEFER; - } - return irq_init_cb(np, par_np); + return irq_probe(pdev, par_np); } EXPORT_SYMBOL_GPL(platform_irqchip_probe); diff --git a/drivers/irqchip/qcom-irq-combiner.c b/drivers/irqchip/qcom-irq-combiner.c index 18e696dc7f4d..09819007d08e 100644 --- a/drivers/irqchip/qcom-irq-combiner.c +++ b/drivers/irqchip/qcom-irq-combiner.c @@ -222,7 +222,7 @@ static int get_registers(struct platform_device *pdev, struct combiner *comb) return 0; } -static int __init combiner_probe(struct platform_device *pdev) +static int combiner_probe(struct platform_device *pdev) { struct combiner *combiner; int nregs; @@ -266,11 +266,11 @@ static const struct acpi_device_id qcom_irq_combiner_ids[] = { { } }; -static struct platform_driver qcom_irq_combiner_probe = { +static struct platform_driver qcom_irq_combiner_driver = { .driver = { .name = "qcom-irq-combiner", .acpi_match_table = ACPI_PTR(qcom_irq_combiner_ids), }, .probe = combiner_probe, }; -builtin_platform_driver(qcom_irq_combiner_probe); +builtin_platform_driver(qcom_irq_combiner_driver); diff --git a/drivers/irqchip/qcom-pdc.c b/drivers/irqchip/qcom-pdc.c index 52d77546aacb..518f7f0f3dab 100644 --- a/drivers/irqchip/qcom-pdc.c +++ b/drivers/irqchip/qcom-pdc.c @@ -350,9 +350,10 @@ static int pdc_setup_pin_mapping(struct device_node *np) #define QCOM_PDC_SIZE 0x30000 -static int qcom_pdc_init(struct device_node *node, struct device_node *parent) +static int qcom_pdc_probe(struct platform_device *pdev, struct device_node *parent) { struct irq_domain *parent_domain, *pdc_domain; + struct device_node *node = pdev->dev.of_node; resource_size_t res_size; struct resource res; int ret; @@ -428,7 +429,7 @@ fail: } IRQCHIP_PLATFORM_DRIVER_BEGIN(qcom_pdc) -IRQCHIP_MATCH("qcom,pdc", qcom_pdc_init) +IRQCHIP_MATCH("qcom,pdc", qcom_pdc_probe) IRQCHIP_PLATFORM_DRIVER_END(qcom_pdc) MODULE_DESCRIPTION("Qualcomm Technologies, Inc. Power Domain Controller"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/isdn/hardware/mISDN/hfcsusb.c b/drivers/isdn/hardware/mISDN/hfcsusb.c index e54419a4e731..541a20cb58f1 100644 --- a/drivers/isdn/hardware/mISDN/hfcsusb.c +++ b/drivers/isdn/hardware/mISDN/hfcsusb.c @@ -1904,13 +1904,13 @@ out: mISDN_freebchannel(&hw->bch[1]); mISDN_freebchannel(&hw->bch[0]); mISDN_freedchannel(&hw->dch); - kfree(hw); return err; } static int hfcsusb_probe(struct usb_interface *intf, const struct usb_device_id *id) { + int err; struct hfcsusb *hw; struct usb_device *dev = interface_to_usbdev(intf); struct usb_host_interface *iface = intf->cur_altsetting; @@ -2101,20 +2101,28 @@ hfcsusb_probe(struct usb_interface *intf, const struct usb_device_id *id) if (!hw->ctrl_urb) { pr_warn("%s: No memory for control urb\n", driver_info->vend_name); - kfree(hw); - return -ENOMEM; + err = -ENOMEM; + goto err_free_hw; } pr_info("%s: %s: detected \"%s\" (%s, if=%d alt=%d)\n", hw->name, __func__, driver_info->vend_name, conf_str[small_match], ifnum, alt_used); - if (setup_instance(hw, dev->dev.parent)) - return -EIO; + if (setup_instance(hw, dev->dev.parent)) { + err = -EIO; + goto err_free_urb; + } hw->intf = intf; usb_set_intfdata(hw->intf, hw); return 0; + +err_free_urb: + usb_free_urb(hw->ctrl_urb); +err_free_hw: + kfree(hw); + return err; } /* function called when an active device is removed */ diff --git a/drivers/mailbox/mailbox-test.c b/drivers/mailbox/mailbox-test.c index c9dd8c42c0cd..3a28ab5c42e5 100644 --- a/drivers/mailbox/mailbox-test.c +++ b/drivers/mailbox/mailbox-test.c @@ -268,7 +268,7 @@ static int mbox_test_add_debugfs(struct platform_device *pdev, return 0; tdev->root_debugfs_dir = debugfs_create_dir(dev_name(&pdev->dev), NULL); - if (!tdev->root_debugfs_dir) { + if (IS_ERR(tdev->root_debugfs_dir)) { dev_err(&pdev->dev, "Failed to create Mailbox debugfs\n"); return -EINVAL; } diff --git a/drivers/mailbox/mailbox-th1520.c b/drivers/mailbox/mailbox-th1520.c index a6b2aa9ae952..626957c2e435 100644 --- a/drivers/mailbox/mailbox-th1520.c +++ b/drivers/mailbox/mailbox-th1520.c @@ -435,10 +435,8 @@ static int th1520_mbox_probe(struct platform_device *pdev) } ret = devm_add_action_or_reset(dev, th1520_disable_clk, priv); - if (ret) { - clk_bulk_disable_unprepare(ARRAY_SIZE(priv->clocks), priv->clocks); + if (ret) return ret; - } /* * The address mappings in the device tree align precisely with those diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c b/drivers/mailbox/mtk-cmdq-mailbox.c index 654a60f63756..5791f80f995a 100644 --- a/drivers/mailbox/mtk-cmdq-mailbox.c +++ b/drivers/mailbox/mtk-cmdq-mailbox.c @@ -92,6 +92,18 @@ struct gce_plat { u32 gce_num; }; +static inline u32 cmdq_convert_gce_addr(dma_addr_t addr, const struct gce_plat *pdata) +{ + /* Convert DMA addr (PA or IOVA) to GCE readable addr */ + return addr >> pdata->shift; +} + +static inline dma_addr_t cmdq_revert_gce_addr(u32 addr, const struct gce_plat *pdata) +{ + /* Revert GCE readable addr to DMA addr (PA or IOVA) */ + return (dma_addr_t)addr << pdata->shift; +} + u8 cmdq_get_shift_pa(struct mbox_chan *chan) { struct cmdq *cmdq = container_of(chan->mbox, struct cmdq, mbox); @@ -188,13 +200,12 @@ static void cmdq_task_insert_into_thread(struct cmdq_task *task) struct cmdq_task *prev_task = list_last_entry( &thread->task_busy_list, typeof(*task), list_entry); u64 *prev_task_base = prev_task->pkt->va_base; + u32 gce_addr = cmdq_convert_gce_addr(task->pa_base, task->cmdq->pdata); /* let previous task jump to this task */ dma_sync_single_for_cpu(dev, prev_task->pa_base, prev_task->pkt->cmd_buf_size, DMA_TO_DEVICE); - prev_task_base[CMDQ_NUM_CMD(prev_task->pkt) - 1] = - (u64)CMDQ_JUMP_BY_PA << 32 | - (task->pa_base >> task->cmdq->pdata->shift); + prev_task_base[CMDQ_NUM_CMD(prev_task->pkt) - 1] = (u64)CMDQ_JUMP_BY_PA << 32 | gce_addr; dma_sync_single_for_device(dev, prev_task->pa_base, prev_task->pkt->cmd_buf_size, DMA_TO_DEVICE); @@ -237,7 +248,8 @@ static void cmdq_thread_irq_handler(struct cmdq *cmdq, struct cmdq_thread *thread) { struct cmdq_task *task, *tmp, *curr_task = NULL; - u32 curr_pa, irq_flag, task_end_pa; + u32 irq_flag, gce_addr; + dma_addr_t curr_pa, task_end_pa; bool err; irq_flag = readl(thread->base + CMDQ_THR_IRQ_STATUS); @@ -259,7 +271,8 @@ static void cmdq_thread_irq_handler(struct cmdq *cmdq, else return; - curr_pa = readl(thread->base + CMDQ_THR_CURR_ADDR) << cmdq->pdata->shift; + gce_addr = readl(thread->base + CMDQ_THR_CURR_ADDR); + curr_pa = cmdq_revert_gce_addr(gce_addr, cmdq->pdata); list_for_each_entry_safe(task, tmp, &thread->task_busy_list, list_entry) { @@ -378,7 +391,8 @@ static int cmdq_mbox_send_data(struct mbox_chan *chan, void *data) struct cmdq_thread *thread = (struct cmdq_thread *)chan->con_priv; struct cmdq *cmdq = dev_get_drvdata(chan->mbox->dev); struct cmdq_task *task; - unsigned long curr_pa, end_pa; + u32 gce_addr; + dma_addr_t curr_pa, end_pa; /* Client should not flush new tasks if suspended. */ WARN_ON(cmdq->suspended); @@ -402,20 +416,20 @@ static int cmdq_mbox_send_data(struct mbox_chan *chan, void *data) */ WARN_ON(cmdq_thread_reset(cmdq, thread) < 0); - writel(task->pa_base >> cmdq->pdata->shift, - thread->base + CMDQ_THR_CURR_ADDR); - writel((task->pa_base + pkt->cmd_buf_size) >> cmdq->pdata->shift, - thread->base + CMDQ_THR_END_ADDR); + gce_addr = cmdq_convert_gce_addr(task->pa_base, cmdq->pdata); + writel(gce_addr, thread->base + CMDQ_THR_CURR_ADDR); + gce_addr = cmdq_convert_gce_addr(task->pa_base + pkt->cmd_buf_size, cmdq->pdata); + writel(gce_addr, thread->base + CMDQ_THR_END_ADDR); writel(thread->priority, thread->base + CMDQ_THR_PRIORITY); writel(CMDQ_THR_IRQ_EN, thread->base + CMDQ_THR_IRQ_ENABLE); writel(CMDQ_THR_ENABLED, thread->base + CMDQ_THR_ENABLE_TASK); } else { WARN_ON(cmdq_thread_suspend(cmdq, thread) < 0); - curr_pa = readl(thread->base + CMDQ_THR_CURR_ADDR) << - cmdq->pdata->shift; - end_pa = readl(thread->base + CMDQ_THR_END_ADDR) << - cmdq->pdata->shift; + gce_addr = readl(thread->base + CMDQ_THR_CURR_ADDR); + curr_pa = cmdq_revert_gce_addr(gce_addr, cmdq->pdata); + gce_addr = readl(thread->base + CMDQ_THR_END_ADDR); + end_pa = cmdq_revert_gce_addr(gce_addr, cmdq->pdata); /* check boundary */ if (curr_pa == end_pa - CMDQ_INST_SIZE || curr_pa == end_pa) { @@ -646,6 +660,9 @@ static int cmdq_probe(struct platform_device *pdev) if (err) return err; + dma_set_coherent_mask(dev, + DMA_BIT_MASK(sizeof(u32) * BITS_PER_BYTE + cmdq->pdata->shift)); + cmdq->mbox.dev = dev; cmdq->mbox.chans = devm_kcalloc(dev, cmdq->pdata->thread_nr, sizeof(*cmdq->mbox.chans), GFP_KERNEL); diff --git a/drivers/mailbox/mtk-gpueb-mailbox.c b/drivers/mailbox/mtk-gpueb-mailbox.c index 925bcf21f650..f6d2beccd91b 100644 --- a/drivers/mailbox/mtk-gpueb-mailbox.c +++ b/drivers/mailbox/mtk-gpueb-mailbox.c @@ -200,7 +200,7 @@ static bool mtk_gpueb_mbox_last_tx_done(struct mbox_chan *chan) return !(readl(ch->ebm->mbox_ctl + GPUEB_MBOX_CTL_TX_STS) & BIT(ch->num)); } -const struct mbox_chan_ops mtk_gpueb_mbox_ops = { +static const struct mbox_chan_ops mtk_gpueb_mbox_ops = { .send_data = mtk_gpueb_mbox_send_data, .startup = mtk_gpueb_mbox_startup, .shutdown = mtk_gpueb_mbox_shutdown, diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c index 680243751d62..17fe6545875d 100644 --- a/drivers/mailbox/omap-mailbox.c +++ b/drivers/mailbox/omap-mailbox.c @@ -68,6 +68,7 @@ struct omap_mbox_fifo { struct omap_mbox_match_data { u32 intr_type; + bool is_exclusive; }; struct omap_mbox_device { @@ -78,6 +79,7 @@ struct omap_mbox_device { u32 num_users; u32 num_fifos; u32 intr_type; + const struct omap_mbox_match_data *mbox_data; }; struct omap_mbox { @@ -341,11 +343,13 @@ static int omap_mbox_suspend(struct device *dev) if (pm_runtime_status_suspended(dev)) return 0; - for (fifo = 0; fifo < mdev->num_fifos; fifo++) { - if (mbox_read_reg(mdev, MAILBOX_MSGSTATUS(fifo))) { - dev_err(mdev->dev, "fifo %d has unexpected unread messages\n", - fifo); - return -EBUSY; + if (mdev->mbox_data->is_exclusive) { + for (fifo = 0; fifo < mdev->num_fifos; fifo++) { + if (mbox_read_reg(mdev, MAILBOX_MSGSTATUS(fifo))) { + dev_err(mdev->dev, "fifo %d has unexpected unread messages\n", + fifo); + return -EBUSY; + } } } @@ -378,8 +382,9 @@ static const struct dev_pm_ops omap_mbox_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(omap_mbox_suspend, omap_mbox_resume) }; -static const struct omap_mbox_match_data omap2_data = { MBOX_INTR_CFG_TYPE1 }; -static const struct omap_mbox_match_data omap4_data = { MBOX_INTR_CFG_TYPE2 }; +static const struct omap_mbox_match_data omap2_data = { MBOX_INTR_CFG_TYPE1, true }; +static const struct omap_mbox_match_data omap4_data = { MBOX_INTR_CFG_TYPE2, true }; +static const struct omap_mbox_match_data am654_data = { MBOX_INTR_CFG_TYPE2, false }; static const struct of_device_id omap_mailbox_of_match[] = { { @@ -396,11 +401,11 @@ static const struct of_device_id omap_mailbox_of_match[] = { }, { .compatible = "ti,am654-mailbox", - .data = &omap4_data, + .data = &am654_data, }, { .compatible = "ti,am64-mailbox", - .data = &omap4_data, + .data = &am654_data, }, { /* end */ @@ -449,7 +454,6 @@ static int omap_mbox_probe(struct platform_device *pdev) struct omap_mbox_fifo *fifo; struct device_node *node = pdev->dev.of_node; struct device_node *child; - const struct omap_mbox_match_data *match_data; struct mbox_controller *controller; u32 intr_type, info_count; u32 num_users, num_fifos; @@ -462,11 +466,6 @@ static int omap_mbox_probe(struct platform_device *pdev) return -ENODEV; } - match_data = of_device_get_match_data(&pdev->dev); - if (!match_data) - return -ENODEV; - intr_type = match_data->intr_type; - if (of_property_read_u32(node, "ti,mbox-num-users", &num_users)) return -ENODEV; @@ -483,6 +482,12 @@ static int omap_mbox_probe(struct platform_device *pdev) if (!mdev) return -ENOMEM; + mdev->mbox_data = device_get_match_data(&pdev->dev); + if (!mdev->mbox_data) + return -ENODEV; + + intr_type = mdev->mbox_data->intr_type; + mdev->mbox_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(mdev->mbox_base)) return PTR_ERR(mdev->mbox_base); diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c index 0a00719b2482..ff292b9e0be9 100644 --- a/drivers/mailbox/pcc.c +++ b/drivers/mailbox/pcc.c @@ -276,9 +276,8 @@ static int pcc_mbox_error_check_and_clear(struct pcc_chan_info *pchan) if (ret) return ret; - val &= pchan->error.status_mask; - if (val) { - val &= ~pchan->error.status_mask; + if (val & pchan->error.status_mask) { + val &= pchan->error.preserve_mask; pcc_chan_reg_write(&pchan->error, val); return -EIO; } @@ -745,7 +744,8 @@ static int pcc_parse_subspace_db_reg(struct pcc_chan_info *pchan, ret = pcc_chan_reg_init(&pchan->error, &pcct_ext->error_status_register, - 0, 0, pcct_ext->error_status_mask, + ~pcct_ext->error_status_mask, 0, + pcct_ext->error_status_mask, "Error Status"); } return ret; diff --git a/drivers/md/dm-pcache/Makefile b/drivers/md/dm-pcache/Makefile index 86776e4acad2..cedfd38854f6 100644 --- a/drivers/md/dm-pcache/Makefile +++ b/drivers/md/dm-pcache/Makefile @@ -1,3 +1,3 @@ dm-pcache-y := dm_pcache.o cache_dev.o segment.o backing_dev.o cache.o cache_gc.o cache_writeback.o cache_segment.o cache_key.o cache_req.o -obj-m += dm-pcache.o +obj-$(CONFIG_DM_PCACHE) += dm-pcache.o diff --git a/drivers/md/dm-pcache/cache.c b/drivers/md/dm-pcache/cache.c index d8e92367d947..698697a7a73c 100644 --- a/drivers/md/dm-pcache/cache.c +++ b/drivers/md/dm-pcache/cache.c @@ -181,7 +181,7 @@ static void cache_info_init_default(struct pcache_cache *cache) { struct pcache_cache_info *cache_info = &cache->cache_info; - cache_info->header.seq = 0; + memset(cache_info, 0, sizeof(*cache_info)); cache_info->n_segs = cache->cache_dev->seg_num; cache_info_set_gc_percent(cache_info, PCACHE_CACHE_GC_PERCENT_DEFAULT); } @@ -411,7 +411,7 @@ void pcache_cache_stop(struct dm_pcache *pcache) { struct pcache_cache *cache = &pcache->cache; - cache_flush(cache); + pcache_cache_flush(cache); cancel_delayed_work_sync(&cache->gc_work); flush_work(&cache->clean_work); diff --git a/drivers/md/dm-pcache/cache.h b/drivers/md/dm-pcache/cache.h index 1136d86958c8..27613b56be54 100644 --- a/drivers/md/dm-pcache/cache.h +++ b/drivers/md/dm-pcache/cache.h @@ -339,7 +339,7 @@ void cache_seg_put(struct pcache_cache_segment *cache_seg); void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id); /* cache request*/ -int cache_flush(struct pcache_cache *cache); +int pcache_cache_flush(struct pcache_cache *cache); void miss_read_end_work_fn(struct work_struct *work); int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req); diff --git a/drivers/md/dm-pcache/cache_req.c b/drivers/md/dm-pcache/cache_req.c index 27f94c1fa968..7854a30e07b7 100644 --- a/drivers/md/dm-pcache/cache_req.c +++ b/drivers/md/dm-pcache/cache_req.c @@ -790,7 +790,7 @@ err: } /** - * cache_flush - Flush all ksets to persist any pending cache data + * pcache_cache_flush - Flush all ksets to persist any pending cache data * @cache: Pointer to the cache structure * * This function iterates through all ksets associated with the provided `cache` @@ -802,7 +802,7 @@ err: * the respective error code, preventing the flush operation from proceeding to * subsequent ksets. */ -int cache_flush(struct pcache_cache *cache) +int pcache_cache_flush(struct pcache_cache *cache) { struct pcache_cache_kset *kset; int ret; @@ -827,7 +827,7 @@ int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *p struct bio *bio = pcache_req->bio; if (unlikely(bio->bi_opf & REQ_PREFLUSH)) - return cache_flush(cache); + return pcache_cache_flush(cache); if (bio_data_dir(bio) == READ) return cache_read(cache, pcache_req); diff --git a/drivers/md/dm-pcache/pcache_internal.h b/drivers/md/dm-pcache/pcache_internal.h index d427e534727c..b7a3319d2bd3 100644 --- a/drivers/md/dm-pcache/pcache_internal.h +++ b/drivers/md/dm-pcache/pcache_internal.h @@ -99,7 +99,7 @@ static inline void __must_check *pcache_meta_find_latest(struct pcache_meta_head /* Update latest if a more recent sequence is found */ if (!latest || pcache_meta_seq_after(meta->seq, seq_latest)) { seq_latest = meta->seq; - latest = (void *)header + (i * meta_max_size); + latest = meta_addr; } } diff --git a/drivers/md/dm-vdo/logger.c b/drivers/md/dm-vdo/logger.c index 3f7dc2cb6b98..76a987ccf926 100644 --- a/drivers/md/dm-vdo/logger.c +++ b/drivers/md/dm-vdo/logger.c @@ -34,7 +34,7 @@ static const char *get_current_interrupt_type(void) if (in_nmi()) return "NMI"; - if (in_irq()) + if (in_hardirq()) return "HI"; if (in_softirq()) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index d382a390d39a..72047b47a7a0 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -320,11 +320,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOWAIT); - if (unlikely(!fio->bufs[n])) { - DMERR("failed to allocate FEC buffer"); - return -ENOMEM; - } + fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOIO); } /* try to allocate the maximum number of buffers */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f5e5e59b232b..6c83ab940af7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2005,7 +2005,7 @@ static void dm_split_and_process_bio(struct mapped_device *md, * linear target or multiple linear targets pointing to the same * device), we can send the flush with data directly to it. */ - if (map->flush_bypasses_map) { + if (bio->bi_iter.bi_size && map->flush_bypasses_map) { struct list_head *devices = dm_table_get_devices(map); if (devices->next == devices->prev) goto send_preflush_with_data; diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c index d911021c1bb0..83862d57b126 100644 --- a/drivers/media/common/videobuf2/videobuf2-v4l2.c +++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c @@ -1010,6 +1010,11 @@ int vb2_ioctl_remove_bufs(struct file *file, void *priv, if (vb2_queue_is_busy(vdev->queue, file)) return -EBUSY; + if (vb2_fileio_is_active(vdev->queue)) { + dprintk(vdev->queue, 1, "file io in progress\n"); + return -EBUSY; + } + return vb2_core_remove_bufs(vdev->queue, d->index, d->count); } EXPORT_SYMBOL_GPL(vb2_ioctl_remove_bufs); diff --git a/drivers/media/mc/mc-request.c b/drivers/media/mc/mc-request.c index f66f728b1b43..2ac9ac0a740b 100644 --- a/drivers/media/mc/mc-request.c +++ b/drivers/media/mc/mc-request.c @@ -282,8 +282,6 @@ EXPORT_SYMBOL_GPL(media_request_get_by_fd); int media_request_alloc(struct media_device *mdev, int *alloc_fd) { struct media_request *req; - struct file *filp; - int fd; int ret; /* Either both are NULL or both are non-NULL */ @@ -297,19 +295,6 @@ int media_request_alloc(struct media_device *mdev, int *alloc_fd) if (!req) return -ENOMEM; - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - ret = fd; - goto err_free_req; - } - - filp = anon_inode_getfile("request", &request_fops, NULL, O_CLOEXEC); - if (IS_ERR(filp)) { - ret = PTR_ERR(filp); - goto err_put_fd; - } - - filp->private_data = req; req->mdev = mdev; req->state = MEDIA_REQUEST_STATE_IDLE; req->num_incomplete_objects = 0; @@ -320,19 +305,24 @@ int media_request_alloc(struct media_device *mdev, int *alloc_fd) req->updating_count = 0; req->access_count = 0; - *alloc_fd = fd; + FD_PREPARE(fdf, O_CLOEXEC, + anon_inode_getfile("request", &request_fops, NULL, + O_CLOEXEC)); + if (fdf.err) { + ret = fdf.err; + goto err_free_req; + } + + fd_prepare_file(fdf)->private_data = req; + + *alloc_fd = fd_publish(fdf); snprintf(req->debug_str, sizeof(req->debug_str), "%u:%d", - atomic_inc_return(&mdev->request_id), fd); + atomic_inc_return(&mdev->request_id), *alloc_fd); dev_dbg(mdev->dev, "request: allocated %s\n", req->debug_str); - fd_install(fd, filp); - return 0; -err_put_fd: - put_unused_fd(fd); - err_free_req: if (mdev->ops->req_free) mdev->ops->req_free(req); diff --git a/drivers/media/pci/cx18/cx18-driver.c b/drivers/media/pci/cx18/cx18-driver.c index b62fd12c93c1..74c59a94b2b0 100644 --- a/drivers/media/pci/cx18/cx18-driver.c +++ b/drivers/media/pci/cx18/cx18-driver.c @@ -1136,11 +1136,8 @@ int cx18_init_on_first_open(struct cx18 *cx) int video_input; int fw_retry_count = 3; struct v4l2_frequency vf; - struct cx18_open_id fh; v4l2_std_id std; - fh.cx = cx; - if (test_bit(CX18_F_I_FAILED, &cx->i_flags)) return -ENXIO; @@ -1220,14 +1217,14 @@ int cx18_init_on_first_open(struct cx18 *cx) video_input = cx->active_input; cx->active_input++; /* Force update of input */ - cx18_s_input(NULL, &fh, video_input); + cx18_do_s_input(cx, video_input); /* Let the VIDIOC_S_STD ioctl do all the work, keeps the code in one place. */ cx->std++; /* Force full standard initialization */ std = (cx->tuner_std == V4L2_STD_ALL) ? V4L2_STD_NTSC_M : cx->tuner_std; - cx18_s_std(NULL, &fh, std); - cx18_s_frequency(NULL, &fh, &vf); + cx18_do_s_std(cx, std); + cx18_do_s_frequency(cx, &vf); return 0; } diff --git a/drivers/media/pci/cx18/cx18-ioctl.c b/drivers/media/pci/cx18/cx18-ioctl.c index 0f3019739d03..0d676a57e24e 100644 --- a/drivers/media/pci/cx18/cx18-ioctl.c +++ b/drivers/media/pci/cx18/cx18-ioctl.c @@ -521,10 +521,8 @@ static int cx18_g_input(struct file *file, void *fh, unsigned int *i) return 0; } -int cx18_s_input(struct file *file, void *fh, unsigned int inp) +int cx18_do_s_input(struct cx18 *cx, unsigned int inp) { - struct cx18_open_id *id = file2id(file); - struct cx18 *cx = id->cx; v4l2_std_id std = V4L2_STD_ALL; const struct cx18_card_video_input *card_input = cx->card->video_inputs + inp; @@ -558,6 +556,11 @@ int cx18_s_input(struct file *file, void *fh, unsigned int inp) return 0; } +static int cx18_s_input(struct file *file, void *fh, unsigned int inp) +{ + return cx18_do_s_input(file2id(file)->cx, inp); +} + static int cx18_g_frequency(struct file *file, void *fh, struct v4l2_frequency *vf) { @@ -570,11 +573,8 @@ static int cx18_g_frequency(struct file *file, void *fh, return 0; } -int cx18_s_frequency(struct file *file, void *fh, const struct v4l2_frequency *vf) +int cx18_do_s_frequency(struct cx18 *cx, const struct v4l2_frequency *vf) { - struct cx18_open_id *id = file2id(file); - struct cx18 *cx = id->cx; - if (vf->tuner != 0) return -EINVAL; @@ -585,6 +585,12 @@ int cx18_s_frequency(struct file *file, void *fh, const struct v4l2_frequency *v return 0; } +static int cx18_s_frequency(struct file *file, void *fh, + const struct v4l2_frequency *vf) +{ + return cx18_do_s_frequency(file2id(file)->cx, vf); +} + static int cx18_g_std(struct file *file, void *fh, v4l2_std_id *std) { struct cx18 *cx = file2id(file)->cx; @@ -593,11 +599,8 @@ static int cx18_g_std(struct file *file, void *fh, v4l2_std_id *std) return 0; } -int cx18_s_std(struct file *file, void *fh, v4l2_std_id std) +int cx18_do_s_std(struct cx18 *cx, v4l2_std_id std) { - struct cx18_open_id *id = file2id(file); - struct cx18 *cx = id->cx; - if ((std & V4L2_STD_ALL) == 0) return -EINVAL; @@ -642,6 +645,11 @@ int cx18_s_std(struct file *file, void *fh, v4l2_std_id std) return 0; } +static int cx18_s_std(struct file *file, void *fh, v4l2_std_id std) +{ + return cx18_do_s_std(file2id(file)->cx, std); +} + static int cx18_s_tuner(struct file *file, void *fh, const struct v4l2_tuner *vt) { struct cx18_open_id *id = file2id(file); diff --git a/drivers/media/pci/cx18/cx18-ioctl.h b/drivers/media/pci/cx18/cx18-ioctl.h index 97cd9f99e22d..42a8acd69735 100644 --- a/drivers/media/pci/cx18/cx18-ioctl.h +++ b/drivers/media/pci/cx18/cx18-ioctl.h @@ -12,6 +12,8 @@ u16 cx18_service2vbi(int type); void cx18_expand_service_set(struct v4l2_sliced_vbi_format *fmt, int is_pal); u16 cx18_get_service_set(struct v4l2_sliced_vbi_format *fmt); void cx18_set_funcs(struct video_device *vdev); -int cx18_s_std(struct file *file, void *fh, v4l2_std_id std); -int cx18_s_frequency(struct file *file, void *fh, const struct v4l2_frequency *vf); -int cx18_s_input(struct file *file, void *fh, unsigned int inp); + +struct cx18; +int cx18_do_s_std(struct cx18 *cx, v4l2_std_id std); +int cx18_do_s_frequency(struct cx18 *cx, const struct v4l2_frequency *vf); +int cx18_do_s_input(struct cx18 *cx, unsigned int inp); diff --git a/drivers/media/pci/ivtv/ivtv-driver.c b/drivers/media/pci/ivtv/ivtv-driver.c index 72a8f76a41f4..459eb6cc370c 100644 --- a/drivers/media/pci/ivtv/ivtv-driver.c +++ b/drivers/media/pci/ivtv/ivtv-driver.c @@ -1247,15 +1247,12 @@ err: int ivtv_init_on_first_open(struct ivtv *itv) { - struct v4l2_frequency vf; /* Needed to call ioctls later */ - struct ivtv_open_id fh; + struct ivtv_stream *s = &itv->streams[IVTV_ENC_STREAM_TYPE_MPG]; + struct v4l2_frequency vf; int fw_retry_count = 3; int video_input; - fh.itv = itv; - fh.type = IVTV_ENC_STREAM_TYPE_MPG; - if (test_bit(IVTV_F_I_FAILED, &itv->i_flags)) return -ENXIO; @@ -1297,13 +1294,13 @@ int ivtv_init_on_first_open(struct ivtv *itv) video_input = itv->active_input; itv->active_input++; /* Force update of input */ - ivtv_s_input(NULL, &fh, video_input); + ivtv_do_s_input(itv, video_input); /* Let the VIDIOC_S_STD ioctl do all the work, keeps the code in one place. */ itv->std++; /* Force full standard initialization */ itv->std_out = itv->std; - ivtv_s_frequency(NULL, &fh, &vf); + ivtv_do_s_frequency(s, &vf); if (itv->card->v4l2_capabilities & V4L2_CAP_VIDEO_OUTPUT) { /* Turn on the TV-out: ivtv_init_mpeg_decoder() initializes diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c index 84c73bd22f2d..8d5ea3aec06f 100644 --- a/drivers/media/pci/ivtv/ivtv-ioctl.c +++ b/drivers/media/pci/ivtv/ivtv-ioctl.c @@ -974,9 +974,8 @@ static int ivtv_g_input(struct file *file, void *fh, unsigned int *i) return 0; } -int ivtv_s_input(struct file *file, void *fh, unsigned int inp) +int ivtv_do_s_input(struct ivtv *itv, unsigned int inp) { - struct ivtv *itv = file2id(file)->itv; v4l2_std_id std; int i; @@ -1017,6 +1016,11 @@ int ivtv_s_input(struct file *file, void *fh, unsigned int inp) return 0; } +static int ivtv_s_input(struct file *file, void *fh, unsigned int inp) +{ + return ivtv_do_s_input(file2id(file)->itv, inp); +} + static int ivtv_g_output(struct file *file, void *fh, unsigned int *i) { struct ivtv *itv = file2id(file)->itv; @@ -1065,10 +1069,9 @@ static int ivtv_g_frequency(struct file *file, void *fh, struct v4l2_frequency * return 0; } -int ivtv_s_frequency(struct file *file, void *fh, const struct v4l2_frequency *vf) +int ivtv_do_s_frequency(struct ivtv_stream *s, const struct v4l2_frequency *vf) { - struct ivtv *itv = file2id(file)->itv; - struct ivtv_stream *s = &itv->streams[file2id(file)->type]; + struct ivtv *itv = s->itv; if (s->vdev.vfl_dir) return -ENOTTY; @@ -1082,6 +1085,15 @@ int ivtv_s_frequency(struct file *file, void *fh, const struct v4l2_frequency *v return 0; } +static int ivtv_s_frequency(struct file *file, void *fh, + const struct v4l2_frequency *vf) +{ + struct ivtv_open_id *id = file2id(file); + struct ivtv *itv = id->itv; + + return ivtv_do_s_frequency(&itv->streams[id->type], vf); +} + static int ivtv_g_std(struct file *file, void *fh, v4l2_std_id *std) { struct ivtv *itv = file2id(file)->itv; diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.h b/drivers/media/pci/ivtv/ivtv-ioctl.h index 7f8c6f43d397..96ca7e2ef973 100644 --- a/drivers/media/pci/ivtv/ivtv-ioctl.h +++ b/drivers/media/pci/ivtv/ivtv-ioctl.h @@ -9,6 +9,8 @@ #ifndef IVTV_IOCTL_H #define IVTV_IOCTL_H +struct ivtv; + u16 ivtv_service2vbi(int type); void ivtv_expand_service_set(struct v4l2_sliced_vbi_format *fmt, int is_pal); u16 ivtv_get_service_set(struct v4l2_sliced_vbi_format *fmt); @@ -17,7 +19,7 @@ int ivtv_set_speed(struct ivtv *itv, int speed); void ivtv_set_funcs(struct video_device *vdev); void ivtv_s_std_enc(struct ivtv *itv, v4l2_std_id std); void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std); -int ivtv_s_frequency(struct file *file, void *fh, const struct v4l2_frequency *vf); -int ivtv_s_input(struct file *file, void *fh, unsigned int inp); +int ivtv_do_s_frequency(struct ivtv_stream *s, const struct v4l2_frequency *vf); +int ivtv_do_s_input(struct ivtv *itv, unsigned int inp); #endif diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c index fb6afb8e84f0..ee4f54d68349 100644 --- a/drivers/media/usb/uvc/uvc_driver.c +++ b/drivers/media/usb/uvc/uvc_driver.c @@ -167,13 +167,26 @@ static struct uvc_entity *uvc_entity_by_reference(struct uvc_device *dev, static struct uvc_streaming *uvc_stream_by_id(struct uvc_device *dev, int id) { - struct uvc_streaming *stream; + struct uvc_streaming *stream, *last_stream; + unsigned int count = 0; list_for_each_entry(stream, &dev->streams, list) { + count += 1; + last_stream = stream; if (stream->header.bTerminalLink == id) return stream; } + /* + * If the streaming entity is referenced by an invalid ID, notify the + * user and use heuristics to guess the correct entity. + */ + if (count == 1 && id == UVC_INVALID_ENTITY_ID) { + dev_warn(&dev->intf->dev, + "UVC non compliance: Invalid USB header. The streaming entity has an invalid ID, guessing the correct one."); + return last_stream; + } + return NULL; } diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c index 1da953629010..25e66bf18f5f 100644 --- a/drivers/media/v4l2-core/v4l2-subdev.c +++ b/drivers/media/v4l2-core/v4l2-subdev.c @@ -2608,7 +2608,7 @@ EXPORT_SYMBOL_GPL(v4l2_subdev_is_streaming); int v4l2_subdev_get_privacy_led(struct v4l2_subdev *sd) { #if IS_REACHABLE(CONFIG_LEDS_CLASS) - sd->privacy_led = led_get(sd->dev, "privacy-led"); + sd->privacy_led = led_get(sd->dev, "privacy"); if (IS_ERR(sd->privacy_led) && PTR_ERR(sd->privacy_led) != -ENOENT) return dev_err_probe(sd->dev, PTR_ERR(sd->privacy_led), "getting privacy LED\n"); diff --git a/drivers/memory/tegra/tegra210.c b/drivers/memory/tegra/tegra210.c index cfa61dd88557..3c2949c16fde 100644 --- a/drivers/memory/tegra/tegra210.c +++ b/drivers/memory/tegra/tegra210.c @@ -1015,7 +1015,7 @@ static const struct tegra_mc_client tegra210_mc_clients[] = { }, }, }, { - .id = TEGRA210_MC_SESRD, + .id = TEGRA210_MC_SESWR, .name = "seswr", .swgroup = TEGRA_SWGROUP_SE, .regs = { @@ -1079,7 +1079,7 @@ static const struct tegra_mc_client tegra210_mc_clients[] = { }, }, }, { - .id = TEGRA210_MC_ETRR, + .id = TEGRA210_MC_ETRW, .name = "etrw", .swgroup = TEGRA_SWGROUP_ETR, .regs = { diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index b017ff29dbd1..73cad914be9f 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -223,6 +223,10 @@ static int mei_me_probe(struct pci_dev *pdev, const struct pci_device_id *ent) hw->mem_addr = pcim_iomap_table(pdev)[0]; hw->read_fws = mei_me_read_fws; + err = mei_register(dev, &pdev->dev); + if (err) + goto end; + pci_enable_msi(pdev); hw->irq = pdev->irq; @@ -237,13 +241,9 @@ static int mei_me_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) { dev_err(&pdev->dev, "request_threaded_irq failure. irq = %d\n", pdev->irq); - goto end; + goto deregister; } - err = mei_register(dev, &pdev->dev); - if (err) - goto release_irq; - if (mei_start(dev)) { dev_err(&pdev->dev, "init hw failure.\n"); err = -ENODEV; @@ -283,11 +283,10 @@ static int mei_me_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; deregister: - mei_deregister(dev); -release_irq: mei_cancel_work(dev); mei_disable_interrupts(dev); free_irq(pdev->irq, dev); + mei_deregister(dev); end: dev_err(&pdev->dev, "initialization failed.\n"); return err; diff --git a/drivers/misc/mei/pci-txe.c b/drivers/misc/mei/pci-txe.c index 06b55a891c6b..98d1bc2c7f4b 100644 --- a/drivers/misc/mei/pci-txe.c +++ b/drivers/misc/mei/pci-txe.c @@ -87,6 +87,10 @@ static int mei_txe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) hw = to_txe_hw(dev); hw->mem_addr = pcim_iomap_table(pdev); + err = mei_register(dev, &pdev->dev); + if (err) + goto end; + pci_enable_msi(pdev); /* clear spurious interrupts */ @@ -106,13 +110,9 @@ static int mei_txe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) { dev_err(&pdev->dev, "mei: request_threaded_irq failure. irq = %d\n", pdev->irq); - goto end; + goto deregister; } - err = mei_register(dev, &pdev->dev); - if (err) - goto release_irq; - if (mei_start(dev)) { dev_err(&pdev->dev, "init hw failure.\n"); err = -ENODEV; @@ -145,11 +145,10 @@ static int mei_txe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; deregister: - mei_deregister(dev); -release_irq: mei_cancel_work(dev); mei_disable_interrupts(dev); free_irq(pdev->irq, dev); + mei_deregister(dev); end: dev_err(&pdev->dev, "initialization failed.\n"); return err; diff --git a/drivers/misc/mei/platform-vsc.c b/drivers/misc/mei/platform-vsc.c index 288e7b72e942..9787b9cee71c 100644 --- a/drivers/misc/mei/platform-vsc.c +++ b/drivers/misc/mei/platform-vsc.c @@ -362,28 +362,27 @@ static int mei_vsc_probe(struct platform_device *pdev) ret = mei_register(mei_dev, dev); if (ret) - goto err_dereg; + goto err; ret = mei_start(mei_dev); if (ret) { dev_err_probe(dev, ret, "init hw failed\n"); - goto err_cancel; + goto err; } pm_runtime_enable(mei_dev->parent); return 0; -err_dereg: - mei_deregister(mei_dev); - -err_cancel: +err: mei_cancel_work(mei_dev); vsc_tp_register_event_cb(tp, NULL, NULL); mei_disable_interrupts(mei_dev); + mei_deregister(mei_dev); + return ret; } diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 999026a1ae04..9087f045e362 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -721,21 +721,12 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, static int ntsync_obj_get_fd(struct ntsync_obj *obj) { - struct file *file; - int fd; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) - return fd; - file = anon_inode_getfile("ntsync", &ntsync_obj_fops, obj, O_RDWR); - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } - obj->file = file; - fd_install(fd, file); - - return fd; + FD_PREPARE(fdf, O_CLOEXEC, + anon_inode_getfile("ntsync", &ntsync_obj_fops, obj, O_RDWR)); + if (fdf.err) + return fdf.err; + obj->file = fd_prepare_file(fdf); + return fd_publish(fdf); } static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 2c963cb6724b..10d0ef58ef49 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -950,7 +950,7 @@ config MMC_USHC config MMC_WMT tristate "Wondermedia SD/MMC Host Controller support" depends on ARCH_VT8500 || COMPILE_TEST - default y + default ARCH_VT8500 help This selects support for the SD/MMC Host Controller on Wondermedia WM8505/WM8650 based SoCs. diff --git a/drivers/mmc/host/dw_mmc-rockchip.c b/drivers/mmc/host/dw_mmc-rockchip.c index 82dd906bb002..681354942e97 100644 --- a/drivers/mmc/host/dw_mmc-rockchip.c +++ b/drivers/mmc/host/dw_mmc-rockchip.c @@ -42,7 +42,7 @@ struct dw_mci_rockchip_priv_data { */ static int rockchip_mmc_get_internal_phase(struct dw_mci *host, bool sample) { - unsigned long rate = clk_get_rate(host->ciu_clk); + unsigned long rate = clk_get_rate(host->ciu_clk) / RK3288_CLKGEN_DIV; u32 raw_value; u16 degrees; u32 delay_num = 0; @@ -85,7 +85,7 @@ static int rockchip_mmc_get_phase(struct dw_mci *host, bool sample) static int rockchip_mmc_set_internal_phase(struct dw_mci *host, bool sample, int degrees) { - unsigned long rate = clk_get_rate(host->ciu_clk); + unsigned long rate = clk_get_rate(host->ciu_clk) / RK3288_CLKGEN_DIV; u8 nineties, remainder; u8 delay_num; u32 raw_value; diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c index 26d03352af63..b5ea058ed467 100644 --- a/drivers/mmc/host/pxamci.c +++ b/drivers/mmc/host/pxamci.c @@ -652,10 +652,9 @@ static int pxamci_probe(struct platform_device *pdev) host->clkrt = CLKRT_OFF; host->clk = devm_clk_get(dev, NULL); - if (IS_ERR(host->clk)) { - host->clk = NULL; - return PTR_ERR(host->clk); - } + if (IS_ERR(host->clk)) + return dev_err_probe(dev, PTR_ERR(host->clk), + "Failed to acquire clock\n"); host->clkrate = clk_get_rate(host->clk); @@ -703,46 +702,37 @@ static int pxamci_probe(struct platform_device *pdev) platform_set_drvdata(pdev, mmc); - host->dma_chan_rx = dma_request_chan(dev, "rx"); - if (IS_ERR(host->dma_chan_rx)) { - host->dma_chan_rx = NULL; + host->dma_chan_rx = devm_dma_request_chan(dev, "rx"); + if (IS_ERR(host->dma_chan_rx)) return dev_err_probe(dev, PTR_ERR(host->dma_chan_rx), "unable to request rx dma channel\n"); - } - host->dma_chan_tx = dma_request_chan(dev, "tx"); - if (IS_ERR(host->dma_chan_tx)) { - dev_err(dev, "unable to request tx dma channel\n"); - ret = PTR_ERR(host->dma_chan_tx); - host->dma_chan_tx = NULL; - goto out; - } + + host->dma_chan_tx = devm_dma_request_chan(dev, "tx"); + if (IS_ERR(host->dma_chan_tx)) + return dev_err_probe(dev, PTR_ERR(host->dma_chan_tx), + "unable to request tx dma channel\n"); if (host->pdata) { host->detect_delay_ms = host->pdata->detect_delay_ms; host->power = devm_gpiod_get_optional(dev, "power", GPIOD_OUT_LOW); - if (IS_ERR(host->power)) { - ret = PTR_ERR(host->power); - dev_err(dev, "Failed requesting gpio_power\n"); - goto out; - } + if (IS_ERR(host->power)) + return dev_err_probe(dev, PTR_ERR(host->power), + "Failed requesting gpio_power\n"); /* FIXME: should we pass detection delay to debounce? */ ret = mmc_gpiod_request_cd(mmc, "cd", 0, false, 0); - if (ret && ret != -ENOENT) { - dev_err(dev, "Failed requesting gpio_cd\n"); - goto out; - } + if (ret && ret != -ENOENT) + return dev_err_probe(dev, ret, "Failed requesting gpio_cd\n"); if (!host->pdata->gpio_card_ro_invert) mmc->caps2 |= MMC_CAP2_RO_ACTIVE_HIGH; ret = mmc_gpiod_request_ro(mmc, "wp", 0, 0); - if (ret && ret != -ENOENT) { - dev_err(dev, "Failed requesting gpio_ro\n"); - goto out; - } + if (ret && ret != -ENOENT) + return dev_err_probe(dev, ret, "Failed requesting gpio_ro\n"); + if (!ret) host->use_ro_gpio = true; @@ -759,16 +749,8 @@ static int pxamci_probe(struct platform_device *pdev) if (ret) { if (host->pdata && host->pdata->exit) host->pdata->exit(dev, mmc); - goto out; } - return 0; - -out: - if (host->dma_chan_rx) - dma_release_channel(host->dma_chan_rx); - if (host->dma_chan_tx) - dma_release_channel(host->dma_chan_tx); return ret; } @@ -791,8 +773,6 @@ static void pxamci_remove(struct platform_device *pdev) dmaengine_terminate_all(host->dma_chan_rx); dmaengine_terminate_all(host->dma_chan_tx); - dma_release_channel(host->dma_chan_rx); - dma_release_channel(host->dma_chan_tx); } } diff --git a/drivers/mmc/host/sdhci-of-dwcmshc.c b/drivers/mmc/host/sdhci-of-dwcmshc.c index eebd45389956..4e256673a098 100644 --- a/drivers/mmc/host/sdhci-of-dwcmshc.c +++ b/drivers/mmc/host/sdhci-of-dwcmshc.c @@ -94,7 +94,7 @@ #define DLL_TXCLK_TAPNUM_DEFAULT 0x10 #define DLL_TXCLK_TAPNUM_90_DEGREES 0xA #define DLL_TXCLK_TAPNUM_FROM_SW BIT(24) -#define DLL_STRBIN_TAPNUM_DEFAULT 0x8 +#define DLL_STRBIN_TAPNUM_DEFAULT 0x4 #define DLL_STRBIN_TAPNUM_FROM_SW BIT(24) #define DLL_STRBIN_DELAY_NUM_SEL BIT(26) #define DLL_STRBIN_DELAY_NUM_OFFSET 16 @@ -289,6 +289,19 @@ static void dwcmshc_adma_write_desc(struct sdhci_host *host, void **desc, sdhci_adma_write_desc(host, desc, addr, len, cmd); } +static void dwcmshc_reset(struct sdhci_host *host, u8 mask) +{ + sdhci_reset(host, mask); + + /* The dwcmshc does not comply with the SDHCI specification + * regarding the "Software Reset for CMD line should clear 'Command + * Complete' in the Normal Interrupt Status Register." Clear the bit + * here to compensate for this quirk. + */ + if (mask & SDHCI_RESET_CMD) + sdhci_writel(host, SDHCI_INT_RESPONSE, SDHCI_INT_STATUS); +} + static unsigned int dwcmshc_get_max_clock(struct sdhci_host *host) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -832,15 +845,7 @@ static void th1520_sdhci_reset(struct sdhci_host *host, u8 mask) struct dwcmshc_priv *priv = sdhci_pltfm_priv(pltfm_host); u16 ctrl_2; - sdhci_reset(host, mask); - - /* The T-Head 1520 SoC does not comply with the SDHCI specification - * regarding the "Software Reset for CMD line should clear 'Command - * Complete' in the Normal Interrupt Status Register." Clear the bit - * here to compensate for this quirk. - */ - if (mask & SDHCI_RESET_CMD) - sdhci_writel(host, SDHCI_INT_RESPONSE, SDHCI_INT_STATUS); + dwcmshc_reset(host, mask); if (priv->flags & FLAG_IO_FIXED_1V8) { ctrl_2 = sdhci_readw(host, SDHCI_HOST_CONTROL2); @@ -886,7 +891,7 @@ static void cv18xx_sdhci_reset(struct sdhci_host *host, u8 mask) struct dwcmshc_priv *priv = sdhci_pltfm_priv(pltfm_host); u32 val, emmc_caps = MMC_CAP2_NO_SD | MMC_CAP2_NO_SDIO; - sdhci_reset(host, mask); + dwcmshc_reset(host, mask); if ((host->mmc->caps2 & emmc_caps) == emmc_caps) { val = sdhci_readl(host, priv->vendor_specific_area1 + CV18XX_SDHCI_MSHC_CTRL); @@ -958,7 +963,7 @@ static void cv18xx_sdhci_post_tuning(struct sdhci_host *host) val |= SDHCI_INT_DATA_AVAIL; sdhci_writel(host, val, SDHCI_INT_STATUS); - sdhci_reset(host, SDHCI_RESET_CMD | SDHCI_RESET_DATA); + dwcmshc_reset(host, SDHCI_RESET_CMD | SDHCI_RESET_DATA); } static int cv18xx_sdhci_execute_tuning(struct sdhci_host *host, u32 opcode) @@ -1100,7 +1105,7 @@ static const struct sdhci_ops sdhci_dwcmshc_ops = { .set_bus_width = sdhci_set_bus_width, .set_uhs_signaling = dwcmshc_set_uhs_signaling, .get_max_clock = dwcmshc_get_max_clock, - .reset = sdhci_reset, + .reset = dwcmshc_reset, .adma_write_desc = dwcmshc_adma_write_desc, .irq = dwcmshc_cqe_irq_handler, }; diff --git a/drivers/most/most_usb.c b/drivers/most/most_usb.c index 10064d7b7249..41ee169f80c5 100644 --- a/drivers/most/most_usb.c +++ b/drivers/most/most_usb.c @@ -1058,7 +1058,7 @@ hdm_probe(struct usb_interface *interface, const struct usb_device_id *id) ret = most_register_interface(&mdev->iface); if (ret) - goto err_free_busy_urbs; + return ret; mutex_lock(&mdev->io_mutex); if (le16_to_cpu(usb_dev->descriptor.idProduct) == USB_DEV_ID_OS81118 || @@ -1068,8 +1068,7 @@ hdm_probe(struct usb_interface *interface, const struct usb_device_id *id) if (!mdev->dci) { mutex_unlock(&mdev->io_mutex); most_deregister_interface(&mdev->iface); - ret = -ENOMEM; - goto err_free_busy_urbs; + return -ENOMEM; } mdev->dci->dev.init_name = "dci"; @@ -1078,18 +1077,15 @@ hdm_probe(struct usb_interface *interface, const struct usb_device_id *id) mdev->dci->dev.release = release_dci; if (device_register(&mdev->dci->dev)) { mutex_unlock(&mdev->io_mutex); + put_device(&mdev->dci->dev); most_deregister_interface(&mdev->iface); - ret = -ENOMEM; - goto err_free_dci; + return -ENOMEM; } mdev->dci->usb_device = mdev->usb_device; } mutex_unlock(&mdev->io_mutex); return 0; -err_free_dci: - put_device(&mdev->dci->dev); -err_free_busy_urbs: - kfree(mdev->busy_urbs); + err_free_ep_address: kfree(mdev->ep_address); err_free_cap: diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 8dc4f5c493fc..335c702633ff 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -599,6 +599,7 @@ mtdchar_write_ioctl(struct mtd_info *mtd, struct mtd_write_req __user *argp) uint8_t *datbuf = NULL, *oobbuf = NULL; size_t datbuf_len, oobbuf_len; int ret = 0; + u64 end; if (copy_from_user(&req, argp, sizeof(req))) return -EFAULT; @@ -618,7 +619,7 @@ mtdchar_write_ioctl(struct mtd_info *mtd, struct mtd_write_req __user *argp) req.len &= 0xffffffff; req.ooblen &= 0xffffffff; - if (req.start + req.len > mtd->size) + if (check_add_overflow(req.start, req.len, &end) || end > mtd->size) return -EINVAL; datbuf_len = min_t(size_t, req.len, mtd->erasesize); @@ -698,6 +699,7 @@ mtdchar_read_ioctl(struct mtd_info *mtd, struct mtd_read_req __user *argp) size_t datbuf_len, oobbuf_len; size_t orig_len, orig_ooblen; int ret = 0; + u64 end; if (copy_from_user(&req, argp, sizeof(req))) return -EFAULT; @@ -724,7 +726,7 @@ mtdchar_read_ioctl(struct mtd_info *mtd, struct mtd_read_req __user *argp) req.len &= 0xffffffff; req.ooblen &= 0xffffffff; - if (req.start + req.len > mtd->size) { + if (check_add_overflow(req.start, req.len, &end) || end > mtd->size) { ret = -EINVAL; goto out; } diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 4a17271076bc..1e57c8de8578 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -63,7 +63,7 @@ config MTD_NAND_ECC_MEDIATEK config MTD_NAND_ECC_REALTEK tristate "Realtek RTL93xx hardware ECC engine" - depends on HAS_IOMEM + depends on HAS_IOMEM && HAS_DMA depends on MACH_REALTEK_RTL || COMPILE_TEST select MTD_NAND_ECC help diff --git a/drivers/mtd/nand/ecc-realtek.c b/drivers/mtd/nand/ecc-realtek.c index 7d718934c909..0046da37ea3e 100644 --- a/drivers/mtd/nand/ecc-realtek.c +++ b/drivers/mtd/nand/ecc-realtek.c @@ -380,7 +380,7 @@ static void rtl_ecc_cleanup_ctx(struct nand_device *nand) nand_ecc_cleanup_req_tweaking(&ctx->req_ctx); } -static struct nand_ecc_engine_ops rtl_ecc_engine_ops = { +static const struct nand_ecc_engine_ops rtl_ecc_engine_ops = { .init_ctx = rtl_ecc_init_ctx, .cleanup_ctx = rtl_ecc_cleanup_ctx, .prepare_io_req = rtl_ecc_prepare_io_req, @@ -418,8 +418,8 @@ static int rtl_ecc_probe(struct platform_device *pdev) rtlc->buf = dma_alloc_noncoherent(dev, RTL_ECC_DMA_SIZE, &rtlc->buf_dma, DMA_BIDIRECTIONAL, GFP_KERNEL); - if (IS_ERR(rtlc->buf)) - return PTR_ERR(rtlc->buf); + if (!rtlc->buf) + return -ENOMEM; rtlc->dev = dev; rtlc->engine.dev = dev; diff --git a/drivers/mtd/nand/onenand/onenand_samsung.c b/drivers/mtd/nand/onenand/onenand_samsung.c index f37a6138e461..6d6aa709a21f 100644 --- a/drivers/mtd/nand/onenand/onenand_samsung.c +++ b/drivers/mtd/nand/onenand/onenand_samsung.c @@ -906,7 +906,7 @@ static int s3c_onenand_probe(struct platform_device *pdev) err = devm_request_irq(&pdev->dev, r->start, s5pc110_onenand_irq, IRQF_SHARED, "onenand", - &onenand); + onenand); if (err) { dev_err(&pdev->dev, "failed to get irq\n"); return err; diff --git a/drivers/mtd/nand/raw/cadence-nand-controller.c b/drivers/mtd/nand/raw/cadence-nand-controller.c index 6667eea95597..32ed38b89394 100644 --- a/drivers/mtd/nand/raw/cadence-nand-controller.c +++ b/drivers/mtd/nand/raw/cadence-nand-controller.c @@ -2871,7 +2871,7 @@ cadence_nand_irq_cleanup(int irqnum, struct cdns_nand_ctrl *cdns_ctrl) static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) { dma_cap_mask_t mask; - struct dma_device *dma_dev = cdns_ctrl->dmac->device; + struct dma_device *dma_dev; int ret; cdns_ctrl->cdma_desc = dma_alloc_coherent(cdns_ctrl->dev, @@ -2915,6 +2915,7 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) } } + dma_dev = cdns_ctrl->dmac->device; cdns_ctrl->io.iova_dma = dma_map_resource(dma_dev->dev, cdns_ctrl->io.dma, cdns_ctrl->io.size, DMA_BIDIRECTIONAL, 0); diff --git a/drivers/mtd/nand/spi/fmsh.c b/drivers/mtd/nand/spi/fmsh.c index 8b2097bfc771..c2b9a8c113cb 100644 --- a/drivers/mtd/nand/spi/fmsh.c +++ b/drivers/mtd/nand/spi/fmsh.c @@ -58,7 +58,7 @@ static const struct spinand_info fmsh_spinand_table[] = { SPINAND_INFO_OP_VARIANTS(&read_cache_variants, &write_cache_variants, &update_cache_variants), - SPINAND_HAS_QE_BIT, + 0, SPINAND_ECCINFO(&fm25s01a_ooblayout, NULL)), }; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e95e593cd12d..5abef8a3b775 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2120,7 +2120,7 @@ skip_mac_set: /* check for initial state */ new_slave->link = BOND_LINK_NOCHANGE; if (bond->params.miimon) { - if (netif_carrier_ok(slave_dev)) { + if (netif_running(slave_dev) && netif_carrier_ok(slave_dev)) { if (bond->params.updelay) { bond_set_slave_link_state(new_slave, BOND_LINK_BACK, @@ -2665,7 +2665,8 @@ static int bond_miimon_inspect(struct bonding *bond) bond_for_each_slave_rcu(bond, slave, iter) { bond_propose_link_state(slave, BOND_LINK_NOCHANGE); - link_state = netif_carrier_ok(slave->dev); + link_state = netif_running(slave->dev) && + netif_carrier_ok(slave->dev); switch (slave->link) { case BOND_LINK_UP: diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 495a87f2ea7c..384499c869b8 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -225,13 +225,6 @@ static const struct bond_opt_value bond_ad_actor_sys_prio_tbl[] = { { NULL, -1, 0}, }; -static const struct bond_opt_value bond_actor_port_prio_tbl[] = { - { "minval", 0, BOND_VALFLAG_MIN}, - { "maxval", 65535, BOND_VALFLAG_MAX}, - { "default", 255, BOND_VALFLAG_DEFAULT}, - { NULL, -1, 0}, -}; - static const struct bond_opt_value bond_ad_user_port_key_tbl[] = { { "minval", 0, BOND_VALFLAG_MIN | BOND_VALFLAG_DEFAULT}, { "maxval", 1023, BOND_VALFLAG_MAX}, @@ -497,7 +490,7 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .id = BOND_OPT_ACTOR_PORT_PRIO, .name = "actor_port_prio", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), - .values = bond_actor_port_prio_tbl, + .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_actor_port_prio_set, }, [BOND_OPT_AD_ACTOR_SYSTEM] = { diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index 45d36adb51b7..4c0d7d26df9f 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -709,6 +709,11 @@ static void rcar_canfd_set_bit_reg(void __iomem *addr, u32 val) rcar_canfd_update(val, val, addr); } +static void rcar_canfd_clear_bit_reg(void __iomem *addr, u32 val) +{ + rcar_canfd_update(val, 0, addr); +} + static void rcar_canfd_update_bit_reg(void __iomem *addr, u32 mask, u32 val) { rcar_canfd_update(mask, val, addr); @@ -755,25 +760,6 @@ static void rcar_canfd_set_rnc(struct rcar_canfd_global *gpriv, unsigned int ch, rcar_canfd_set_bit(gpriv->base, RCANFD_GAFLCFG(w), rnc); } -static void rcar_canfd_set_mode(struct rcar_canfd_global *gpriv) -{ - if (gpriv->info->ch_interface_mode) { - u32 ch, val = gpriv->fdmode ? RCANFD_GEN4_FDCFG_FDOE - : RCANFD_GEN4_FDCFG_CLOE; - - for_each_set_bit(ch, &gpriv->channels_mask, - gpriv->info->max_channels) - rcar_canfd_set_bit_reg(&gpriv->fcbase[ch].cfdcfg, val); - } else { - if (gpriv->fdmode) - rcar_canfd_set_bit(gpriv->base, RCANFD_GRMCFG, - RCANFD_GRMCFG_RCMC); - else - rcar_canfd_clear_bit(gpriv->base, RCANFD_GRMCFG, - RCANFD_GRMCFG_RCMC); - } -} - static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv) { struct device *dev = &gpriv->pdev->dev; @@ -806,6 +792,16 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv) /* Reset Global error flags */ rcar_canfd_write(gpriv->base, RCANFD_GERFL, 0x0); + /* Set the controller into appropriate mode */ + if (!gpriv->info->ch_interface_mode) { + if (gpriv->fdmode) + rcar_canfd_set_bit(gpriv->base, RCANFD_GRMCFG, + RCANFD_GRMCFG_RCMC); + else + rcar_canfd_clear_bit(gpriv->base, RCANFD_GRMCFG, + RCANFD_GRMCFG_RCMC); + } + /* Transition all Channels to reset mode */ for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels) { rcar_canfd_clear_bit(gpriv->base, @@ -823,10 +819,23 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv) dev_dbg(dev, "channel %u reset failed\n", ch); return err; } - } - /* Set the controller into appropriate mode */ - rcar_canfd_set_mode(gpriv); + /* Set the controller into appropriate mode */ + if (gpriv->info->ch_interface_mode) { + /* Do not set CLOE and FDOE simultaneously */ + if (!gpriv->fdmode) { + rcar_canfd_clear_bit_reg(&gpriv->fcbase[ch].cfdcfg, + RCANFD_GEN4_FDCFG_FDOE); + rcar_canfd_set_bit_reg(&gpriv->fcbase[ch].cfdcfg, + RCANFD_GEN4_FDCFG_CLOE); + } else { + rcar_canfd_clear_bit_reg(&gpriv->fcbase[ch].cfdcfg, + RCANFD_GEN4_FDCFG_FDOE); + rcar_canfd_clear_bit_reg(&gpriv->fcbase[ch].cfdcfg, + RCANFD_GEN4_FDCFG_CLOE); + } + } + } return 0; } diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index 4d245857ef1c..83476af8adb5 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -548,8 +548,8 @@ irqreturn_t sja1000_interrupt(int irq, void *dev_id) if (priv->read_reg(priv, SJA1000_IER) == IRQ_OFF) goto out; - while ((isrc = priv->read_reg(priv, SJA1000_IR)) && - (n < SJA1000_MAX_IRQ)) { + while ((n < SJA1000_MAX_IRQ) && + (isrc = priv->read_reg(priv, SJA1000_IR))) { status = priv->read_reg(priv, SJA1000_SR); /* check for absent controller due to hw unplug */ diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c index 53bfd873de9b..0a7ba0942839 100644 --- a/drivers/net/can/sun4i_can.c +++ b/drivers/net/can/sun4i_can.c @@ -657,8 +657,8 @@ static irqreturn_t sun4i_can_interrupt(int irq, void *dev_id) u8 isrc, status; int n = 0; - while ((isrc = readl(priv->base + SUN4I_REG_INT_ADDR)) && - (n < SUN4I_CAN_MAX_IRQ)) { + while ((n < SUN4I_CAN_MAX_IRQ) && + (isrc = readl(priv->base + SUN4I_REG_INT_ADDR))) { n++; status = readl(priv->base + SUN4I_REG_STA_ADDR); diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 69b8d6da651b..8d8a610f9144 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -261,14 +261,21 @@ struct canfd_quirk { u8 quirk; } __packed; +/* struct gs_host_frame::echo_id == GS_HOST_FRAME_ECHO_ID_RX indicates + * a regular RX'ed CAN frame + */ +#define GS_HOST_FRAME_ECHO_ID_RX 0xffffffff + struct gs_host_frame { - u32 echo_id; - __le32 can_id; + struct_group(header, + u32 echo_id; + __le32 can_id; - u8 can_dlc; - u8 channel; - u8 flags; - u8 reserved; + u8 can_dlc; + u8 channel; + u8 flags; + u8 reserved; + ); union { DECLARE_FLEX_ARRAY(struct classic_can, classic_can); @@ -568,6 +575,37 @@ gs_usb_get_echo_skb(struct gs_can *dev, struct sk_buff *skb, return len; } +static unsigned int +gs_usb_get_minimum_rx_length(const struct gs_can *dev, const struct gs_host_frame *hf, + unsigned int *data_length_p) +{ + unsigned int minimum_length, data_length = 0; + + if (hf->flags & GS_CAN_FLAG_FD) { + if (hf->echo_id == GS_HOST_FRAME_ECHO_ID_RX) + data_length = can_fd_dlc2len(hf->can_dlc); + + if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) + /* timestamp follows data field of max size */ + minimum_length = struct_size(hf, canfd_ts, 1); + else + minimum_length = sizeof(hf->header) + data_length; + } else { + if (hf->echo_id == GS_HOST_FRAME_ECHO_ID_RX && + !(hf->can_id & cpu_to_le32(CAN_RTR_FLAG))) + data_length = can_cc_dlc2len(hf->can_dlc); + + if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) + /* timestamp follows data field of max size */ + minimum_length = struct_size(hf, classic_can_ts, 1); + else + minimum_length = sizeof(hf->header) + data_length; + } + + *data_length_p = data_length; + return minimum_length; +} + static void gs_usb_receive_bulk_callback(struct urb *urb) { struct gs_usb *parent = urb->context; @@ -576,6 +614,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) int rc; struct net_device_stats *stats; struct gs_host_frame *hf = urb->transfer_buffer; + unsigned int minimum_length, data_length; struct gs_tx_context *txc; struct can_frame *cf; struct canfd_frame *cfd; @@ -594,6 +633,15 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) return; } + minimum_length = sizeof(hf->header); + if (urb->actual_length < minimum_length) { + dev_err_ratelimited(&parent->udev->dev, + "short read (actual_length=%u, minimum_length=%u)\n", + urb->actual_length, minimum_length); + + goto resubmit_urb; + } + /* device reports out of range channel id */ if (hf->channel >= parent->channel_cnt) goto device_detach; @@ -609,20 +657,33 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) if (!netif_running(netdev)) goto resubmit_urb; - if (hf->echo_id == -1) { /* normal rx */ + minimum_length = gs_usb_get_minimum_rx_length(dev, hf, &data_length); + if (urb->actual_length < minimum_length) { + stats->rx_errors++; + stats->rx_length_errors++; + + if (net_ratelimit()) + netdev_err(netdev, + "short read (actual_length=%u, minimum_length=%u)\n", + urb->actual_length, minimum_length); + + goto resubmit_urb; + } + + if (hf->echo_id == GS_HOST_FRAME_ECHO_ID_RX) { /* normal rx */ if (hf->flags & GS_CAN_FLAG_FD) { skb = alloc_canfd_skb(netdev, &cfd); if (!skb) return; cfd->can_id = le32_to_cpu(hf->can_id); - cfd->len = can_fd_dlc2len(hf->can_dlc); + cfd->len = data_length; if (hf->flags & GS_CAN_FLAG_BRS) cfd->flags |= CANFD_BRS; if (hf->flags & GS_CAN_FLAG_ESI) cfd->flags |= CANFD_ESI; - memcpy(cfd->data, hf->canfd->data, cfd->len); + memcpy(cfd->data, hf->canfd->data, data_length); } else { skb = alloc_can_skb(netdev, &cf); if (!skb) @@ -631,7 +692,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) cf->can_id = le32_to_cpu(hf->can_id); can_frame_set_cc_len(cf, hf->can_dlc, dev->can.ctrlmode); - memcpy(cf->data, hf->classic_can->data, 8); + memcpy(cf->data, hf->classic_can->data, data_length); /* ERROR frames tell us information about the controller */ if (le32_to_cpu(hf->can_id) & CAN_ERR_FLAG) @@ -687,7 +748,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) resubmit_urb: usb_fill_bulk_urb(urb, parent->udev, parent->pipe_in, - hf, dev->parent->hf_size_rx, + hf, parent->hf_size_rx, gs_usb_receive_bulk_callback, parent); rc = usb_submit_urb(urb, GFP_ATOMIC); @@ -750,8 +811,21 @@ static void gs_usb_xmit_callback(struct urb *urb) struct gs_can *dev = txc->dev; struct net_device *netdev = dev->netdev; - if (urb->status) - netdev_info(netdev, "usb xmit fail %u\n", txc->echo_id); + if (!urb->status) + return; + + if (urb->status != -ESHUTDOWN && net_ratelimit()) + netdev_info(netdev, "failed to xmit URB %u: %pe\n", + txc->echo_id, ERR_PTR(urb->status)); + + netdev->stats.tx_dropped++; + netdev->stats.tx_errors++; + + can_free_echo_skb(netdev, txc->echo_id, NULL); + gs_free_tx_context(txc); + atomic_dec(&dev->active_tx_urbs); + + netif_wake_queue(netdev); } static netdev_tx_t gs_can_start_xmit(struct sk_buff *skb, diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c index c29828a94ad0..1167d38344f1 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c @@ -685,7 +685,7 @@ static int kvaser_usb_leaf_wait_cmd(const struct kvaser_usb *dev, u8 id, * for further details. */ if (tmp->len == 0) { - pos = round_up(pos, + pos = round_up(pos + 1, le16_to_cpu (dev->bulk_in->wMaxPacketSize)); continue; @@ -1732,7 +1732,7 @@ static void kvaser_usb_leaf_read_bulk_callback(struct kvaser_usb *dev, * number of events in case of a heavy rx load on the bus. */ if (cmd->len == 0) { - pos = round_up(pos, le16_to_cpu + pos = round_up(pos + 1, le16_to_cpu (dev->bulk_in->wMaxPacketSize)); continue; } diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 2f846381d5a7..eb767edc4c13 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -371,11 +371,11 @@ static void b53_set_forwarding(struct b53_device *dev, int enable) * frames should be flooded or not. */ b53_read8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, &mgmt); - mgmt |= B53_UC_FWD_EN | B53_MC_FWD_EN | B53_IPMC_FWD_EN; + mgmt |= B53_UC_FWD_EN | B53_MC_FWD_EN | B53_IP_MC; b53_write8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, mgmt); } else { b53_read8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, &mgmt); - mgmt |= B53_IP_MCAST_25; + mgmt |= B53_IP_MC; b53_write8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, mgmt); } } @@ -1372,6 +1372,10 @@ static void b53_force_port_config(struct b53_device *dev, int port, else reg &= ~PORT_OVERRIDE_FULL_DUPLEX; + reg &= ~(0x3 << GMII_PO_SPEED_S); + if (is5301x(dev) || is58xx(dev)) + reg &= ~PORT_OVERRIDE_SPEED_2000M; + switch (speed) { case 2000: reg |= PORT_OVERRIDE_SPEED_2000M; @@ -1390,6 +1394,11 @@ static void b53_force_port_config(struct b53_device *dev, int port, return; } + if (is5325(dev)) + reg &= ~PORT_OVERRIDE_LP_FLOW_25; + else + reg &= ~(PORT_OVERRIDE_RX_FLOW | PORT_OVERRIDE_TX_FLOW); + if (rx_pause) { if (is5325(dev)) reg |= PORT_OVERRIDE_LP_FLOW_25; @@ -1593,8 +1602,11 @@ static void b53_phylink_mac_link_down(struct phylink_config *config, struct b53_device *dev = dp->ds->priv; int port = dp->index; - if (mode == MLO_AN_PHY) + if (mode == MLO_AN_PHY) { + if (is63xx(dev) && in_range(port, B53_63XX_RGMII0, 4)) + b53_force_link(dev, port, false); return; + } if (mode == MLO_AN_FIXED) { b53_force_link(dev, port, false); @@ -1622,6 +1634,13 @@ static void b53_phylink_mac_link_up(struct phylink_config *config, if (mode == MLO_AN_PHY) { /* Re-negotiate EEE if it was enabled already */ p->eee_enabled = b53_eee_init(ds, port, phydev); + + if (is63xx(dev) && in_range(port, B53_63XX_RGMII0, 4)) { + b53_force_port_config(dev, port, speed, duplex, + tx_pause, rx_pause); + b53_force_link(dev, port, true); + } + return; } @@ -2018,7 +2037,7 @@ static int b53_arl_search_wait(struct b53_device *dev) do { b53_read8(dev, B53_ARLIO_PAGE, offset, ®); if (!(reg & ARL_SRCH_STDN)) - return 0; + return -ENOENT; if (reg & ARL_SRCH_VLID) return 0; @@ -2068,13 +2087,16 @@ static int b53_fdb_copy(int port, const struct b53_arl_entry *ent, int b53_fdb_dump(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data) { + unsigned int count = 0, results_per_hit = 1; struct b53_device *priv = ds->priv; struct b53_arl_entry results[2]; - unsigned int count = 0; u8 offset; int ret; u8 reg; + if (priv->num_arl_bins > 2) + results_per_hit = 2; + mutex_lock(&priv->arl_mutex); if (is5325(priv) || is5365(priv)) @@ -2096,7 +2118,7 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, if (ret) break; - if (priv->num_arl_bins > 2) { + if (results_per_hit == 2) { b53_arl_search_rd(priv, 1, &results[1]); ret = b53_fdb_copy(port, &results[1], cb, data); if (ret) @@ -2106,7 +2128,7 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, break; } - } while (count++ < b53_max_arl_entries(priv) / 2); + } while (count++ < b53_max_arl_entries(priv) / results_per_hit); mutex_unlock(&priv->arl_mutex); diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h index 309fe0e46dad..8ce1ce72e938 100644 --- a/drivers/net/dsa/b53/b53_regs.h +++ b/drivers/net/dsa/b53/b53_regs.h @@ -111,8 +111,7 @@ /* IP Multicast control (8 bit) */ #define B53_IP_MULTICAST_CTRL 0x21 -#define B53_IP_MCAST_25 BIT(0) -#define B53_IPMC_FWD_EN BIT(1) +#define B53_IP_MC BIT(0) #define B53_UC_FWD_EN BIT(6) #define B53_MC_FWD_EN BIT(7) diff --git a/drivers/net/dsa/hirschmann/hellcreek_ptp.c b/drivers/net/dsa/hirschmann/hellcreek_ptp.c index bfe21f9f7dcd..cb23bea9c21b 100644 --- a/drivers/net/dsa/hirschmann/hellcreek_ptp.c +++ b/drivers/net/dsa/hirschmann/hellcreek_ptp.c @@ -376,8 +376,18 @@ static int hellcreek_led_setup(struct hellcreek *hellcreek) hellcreek_set_brightness(hellcreek, STATUS_OUT_IS_GM, 1); /* Register both leds */ - led_classdev_register(hellcreek->dev, &hellcreek->led_sync_good); - led_classdev_register(hellcreek->dev, &hellcreek->led_is_gm); + ret = led_classdev_register(hellcreek->dev, &hellcreek->led_sync_good); + if (ret) { + dev_err(hellcreek->dev, "Failed to register sync_good LED\n"); + goto out; + } + + ret = led_classdev_register(hellcreek->dev, &hellcreek->led_is_gm); + if (ret) { + dev_err(hellcreek->dev, "Failed to register is_gm LED\n"); + led_classdev_unregister(&hellcreek->led_sync_good); + goto out; + } ret = 0; diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c index d747ea1c41a7..5df8f153d511 100644 --- a/drivers/net/dsa/microchip/ksz9477.c +++ b/drivers/net/dsa/microchip/ksz9477.c @@ -1355,9 +1355,15 @@ void ksz9477_config_cpu_port(struct dsa_switch *ds) } } +#define RESV_MCAST_CNT 8 + +static u8 reserved_mcast_map[RESV_MCAST_CNT] = { 0, 1, 3, 16, 32, 33, 2, 17 }; + int ksz9477_enable_stp_addr(struct ksz_device *dev) { + u8 i, ports, update; const u32 *masks; + bool override; u32 data; int ret; @@ -1366,23 +1372,87 @@ int ksz9477_enable_stp_addr(struct ksz_device *dev) /* Enable Reserved multicast table */ ksz_cfg(dev, REG_SW_LUE_CTRL_0, SW_RESV_MCAST_ENABLE, true); - /* Set the Override bit for forwarding BPDU packet to CPU */ - ret = ksz_write32(dev, REG_SW_ALU_VAL_B, - ALU_V_OVERRIDE | BIT(dev->cpu_port)); - if (ret < 0) - return ret; + /* The reserved multicast address table has 8 entries. Each entry has + * a default value of which port to forward. It is assumed the host + * port is the last port in most of the switches, but that is not the + * case for KSZ9477 or maybe KSZ9897. For LAN937X family the default + * port is port 5, the first RGMII port. It is okay for LAN9370, a + * 5-port switch, but may not be correct for the other 8-port + * versions. It is necessary to update the whole table to forward to + * the right ports. + * Furthermore PTP messages can use a reserved multicast address and + * the host will not receive them if this table is not correct. + */ + for (i = 0; i < RESV_MCAST_CNT; i++) { + data = reserved_mcast_map[i] << + dev->info->shifts[ALU_STAT_INDEX]; + data |= ALU_STAT_START | + masks[ALU_STAT_DIRECT] | + masks[ALU_RESV_MCAST_ADDR] | + masks[ALU_STAT_READ]; + ret = ksz_write32(dev, REG_SW_ALU_STAT_CTRL__4, data); + if (ret < 0) + return ret; - data = ALU_STAT_START | ALU_RESV_MCAST_ADDR | masks[ALU_STAT_WRITE]; + /* wait to be finished */ + ret = ksz9477_wait_alu_sta_ready(dev); + if (ret < 0) + return ret; - ret = ksz_write32(dev, REG_SW_ALU_STAT_CTRL__4, data); - if (ret < 0) - return ret; + ret = ksz_read32(dev, REG_SW_ALU_VAL_B, &data); + if (ret < 0) + return ret; - /* wait to be finished */ - ret = ksz9477_wait_alu_sta_ready(dev); - if (ret < 0) { - dev_err(dev->dev, "Failed to update Reserved Multicast table\n"); - return ret; + override = false; + ports = data & dev->port_mask; + switch (i) { + case 0: + case 6: + /* Change the host port. */ + update = BIT(dev->cpu_port); + override = true; + break; + case 2: + /* Change the host port. */ + update = BIT(dev->cpu_port); + break; + case 4: + case 5: + case 7: + /* Skip the host port. */ + update = dev->port_mask & ~BIT(dev->cpu_port); + break; + default: + update = ports; + break; + } + if (update != ports || override) { + data &= ~dev->port_mask; + data |= update; + /* Set Override bit to receive frame even when port is + * closed. + */ + if (override) + data |= ALU_V_OVERRIDE; + ret = ksz_write32(dev, REG_SW_ALU_VAL_B, data); + if (ret < 0) + return ret; + + data = reserved_mcast_map[i] << + dev->info->shifts[ALU_STAT_INDEX]; + data |= ALU_STAT_START | + masks[ALU_STAT_DIRECT] | + masks[ALU_RESV_MCAST_ADDR] | + masks[ALU_STAT_WRITE]; + ret = ksz_write32(dev, REG_SW_ALU_STAT_CTRL__4, data); + if (ret < 0) + return ret; + + /* wait to be finished */ + ret = ksz9477_wait_alu_sta_ready(dev); + if (ret < 0) + return ret; + } } return 0; diff --git a/drivers/net/dsa/microchip/ksz9477_reg.h b/drivers/net/dsa/microchip/ksz9477_reg.h index ff579920078e..61ea11e3338e 100644 --- a/drivers/net/dsa/microchip/ksz9477_reg.h +++ b/drivers/net/dsa/microchip/ksz9477_reg.h @@ -2,7 +2,7 @@ /* * Microchip KSZ9477 register definitions * - * Copyright (C) 2017-2024 Microchip Technology Inc. + * Copyright (C) 2017-2025 Microchip Technology Inc. */ #ifndef __KSZ9477_REGS_H @@ -397,7 +397,6 @@ #define ALU_RESV_MCAST_INDEX_M (BIT(6) - 1) #define ALU_STAT_START BIT(7) -#define ALU_RESV_MCAST_ADDR BIT(1) #define REG_SW_ALU_VAL_A 0x0420 diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index a962055bfdbd..0c10351fe5eb 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -808,6 +808,8 @@ static const u16 ksz9477_regs[] = { static const u32 ksz9477_masks[] = { [ALU_STAT_WRITE] = 0, [ALU_STAT_READ] = 1, + [ALU_STAT_DIRECT] = 0, + [ALU_RESV_MCAST_ADDR] = BIT(1), [P_MII_TX_FLOW_CTRL] = BIT(5), [P_MII_RX_FLOW_CTRL] = BIT(3), }; @@ -835,6 +837,8 @@ static const u8 ksz9477_xmii_ctrl1[] = { static const u32 lan937x_masks[] = { [ALU_STAT_WRITE] = 1, [ALU_STAT_READ] = 2, + [ALU_STAT_DIRECT] = BIT(3), + [ALU_RESV_MCAST_ADDR] = BIT(2), [P_MII_TX_FLOW_CTRL] = BIT(5), [P_MII_RX_FLOW_CTRL] = BIT(3), }; @@ -2583,8 +2587,8 @@ static int ksz_irq_phy_setup(struct ksz_device *dev) irq = irq_find_mapping(dev->ports[port].pirq.domain, PORT_SRC_PHY_INT); - if (irq < 0) { - ret = irq; + if (!irq) { + ret = -EINVAL; goto out; } ds->user_mii_bus->irq[phy] = irq; @@ -2948,8 +2952,8 @@ static int ksz_pirq_setup(struct ksz_device *dev, u8 p) snprintf(pirq->name, sizeof(pirq->name), "port_irq-%d", p); pirq->irq_num = irq_find_mapping(dev->girq.domain, p); - if (pirq->irq_num < 0) - return pirq->irq_num; + if (!pirq->irq_num) + return -EINVAL; return ksz_irq_common_setup(dev, pirq); } @@ -3034,12 +3038,12 @@ static int ksz_setup(struct dsa_switch *ds) dsa_switch_for_each_user_port(dp, dev->ds) { ret = ksz_pirq_setup(dev, dp->index); if (ret) - goto out_girq; + goto port_release; if (dev->info->ptp_capable) { ret = ksz_ptp_irq_setup(ds, dp->index); if (ret) - goto out_pirq; + goto pirq_release; } } } @@ -3049,7 +3053,7 @@ static int ksz_setup(struct dsa_switch *ds) if (ret) { dev_err(dev->dev, "Failed to register PTP clock: %d\n", ret); - goto out_ptpirq; + goto port_release; } } @@ -3072,17 +3076,16 @@ static int ksz_setup(struct dsa_switch *ds) out_ptp_clock_unregister: if (dev->info->ptp_capable) ksz_ptp_clock_unregister(ds); -out_ptpirq: - if (dev->irq > 0 && dev->info->ptp_capable) - dsa_switch_for_each_user_port(dp, dev->ds) - ksz_ptp_irq_free(ds, dp->index); -out_pirq: - if (dev->irq > 0) - dsa_switch_for_each_user_port(dp, dev->ds) +port_release: + if (dev->irq > 0) { + dsa_switch_for_each_user_port_continue_reverse(dp, dev->ds) { + if (dev->info->ptp_capable) + ksz_ptp_irq_free(ds, dp->index); +pirq_release: ksz_irq_free(&dev->ports[dp->index].pirq); -out_girq: - if (dev->irq > 0) + } ksz_irq_free(&dev->girq); + } return ret; } diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index a1eb39771bb9..c65188cd3c0a 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -294,6 +294,8 @@ enum ksz_masks { DYNAMIC_MAC_TABLE_TIMESTAMP, ALU_STAT_WRITE, ALU_STAT_READ, + ALU_STAT_DIRECT, + ALU_RESV_MCAST_ADDR, P_MII_TX_FLOW_CTRL, P_MII_RX_FLOW_CTRL, }; diff --git a/drivers/net/dsa/microchip/ksz_ptp.c b/drivers/net/dsa/microchip/ksz_ptp.c index 35fc21b1ee48..997e4a76d0a6 100644 --- a/drivers/net/dsa/microchip/ksz_ptp.c +++ b/drivers/net/dsa/microchip/ksz_ptp.c @@ -1093,19 +1093,19 @@ static int ksz_ptp_msg_irq_setup(struct ksz_port *port, u8 n) static const char * const name[] = {"pdresp-msg", "xdreq-msg", "sync-msg"}; const struct ksz_dev_ops *ops = port->ksz_dev->dev_ops; + struct ksz_irq *ptpirq = &port->ptpirq; struct ksz_ptp_irq *ptpmsg_irq; ptpmsg_irq = &port->ptpmsg_irq[n]; + ptpmsg_irq->num = irq_create_mapping(ptpirq->domain, n); + if (!ptpmsg_irq->num) + return -EINVAL; ptpmsg_irq->port = port; ptpmsg_irq->ts_reg = ops->get_port_addr(port->num, ts_reg[n]); strscpy(ptpmsg_irq->name, name[n]); - ptpmsg_irq->num = irq_find_mapping(port->ptpirq.domain, n); - if (ptpmsg_irq->num < 0) - return ptpmsg_irq->num; - return request_threaded_irq(ptpmsg_irq->num, NULL, ksz_ptp_msg_thread_fn, IRQF_ONESHOT, ptpmsg_irq->name, ptpmsg_irq); @@ -1135,12 +1135,9 @@ int ksz_ptp_irq_setup(struct dsa_switch *ds, u8 p) if (!ptpirq->domain) return -ENOMEM; - for (irq = 0; irq < ptpirq->nirqs; irq++) - irq_create_mapping(ptpirq->domain, irq); - ptpirq->irq_num = irq_find_mapping(port->pirq.domain, PORT_SRC_PTP_INT); - if (ptpirq->irq_num < 0) { - ret = ptpirq->irq_num; + if (!ptpirq->irq_num) { + ret = -EINVAL; goto out; } @@ -1159,12 +1156,11 @@ int ksz_ptp_irq_setup(struct dsa_switch *ds, u8 p) out_ptp_msg: free_irq(ptpirq->irq_num, ptpirq); - while (irq--) + while (irq--) { free_irq(port->ptpmsg_irq[irq].num, &port->ptpmsg_irq[irq]); -out: - for (irq = 0; irq < ptpirq->nirqs; irq++) irq_dispose_mapping(port->ptpmsg_irq[irq].num); - + } +out: irq_domain_remove(ptpirq->domain); return ret; diff --git a/drivers/net/dsa/microchip/lan937x_main.c b/drivers/net/dsa/microchip/lan937x_main.c index b1ae3b9de3d1..5a1496fff445 100644 --- a/drivers/net/dsa/microchip/lan937x_main.c +++ b/drivers/net/dsa/microchip/lan937x_main.c @@ -540,6 +540,7 @@ static void lan937x_set_tune_adj(struct ksz_device *dev, int port, ksz_pread16(dev, port, reg, &data16); /* Update tune Adjust */ + data16 &= ~PORT_TUNE_ADJ; data16 |= FIELD_PREP(PORT_TUNE_ADJ, val); ksz_pwrite16(dev, port, reg, data16); diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index f674c400f05b..aa2145cf29a6 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1302,14 +1302,7 @@ static int sja1105_set_port_speed(struct sja1105_private *priv, int port, * table, since this will be used for the clocking setup, and we no * longer need to store it in the static config (already told hardware * we want auto during upload phase). - * Actually for the SGMII port, the MAC is fixed at 1 Gbps and - * we need to configure the PCS only (if even that). */ - if (priv->phy_mode[port] == PHY_INTERFACE_MODE_SGMII) - speed = priv->info->port_speed[SJA1105_SPEED_1000MBPS]; - else if (priv->phy_mode[port] == PHY_INTERFACE_MODE_2500BASEX) - speed = priv->info->port_speed[SJA1105_SPEED_2500MBPS]; - mac[port].speed = speed; return 0; diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 691361b25407..c0e17035db18 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -282,7 +282,7 @@ static int airoha_ppe_foe_entry_prepare(struct airoha_eth *eth, if (!airoha_is_valid_gdm_port(eth, port)) return -EINVAL; - if (dsa_port >= 0) + if (dsa_port >= 0 || eth->ports[1]) pse_port = port->id == 4 ? FE_PSE_PORT_GDM4 : port->id; else diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c index 1921741f7311..18b08277d2e1 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c @@ -15,6 +15,7 @@ #include "aq_hw.h" #include "aq_nic.h" +#include "hw_atl/hw_atl_llh.h" void aq_hw_write_reg_bit(struct aq_hw_s *aq_hw, u32 addr, u32 msk, u32 shift, u32 val) @@ -81,6 +82,27 @@ void aq_hw_write_reg64(struct aq_hw_s *hw, u32 reg, u64 value) lo_hi_writeq(value, hw->mmio + reg); } +int aq_hw_invalidate_descriptor_cache(struct aq_hw_s *hw) +{ + int err; + u32 val; + + /* Invalidate Descriptor Cache to prevent writing to the cached + * descriptors and to the data pointer of those descriptors + */ + hw_atl_rdm_rx_dma_desc_cache_init_tgl(hw); + + err = aq_hw_err_from_flags(hw); + if (err) + goto err_exit; + + readx_poll_timeout_atomic(hw_atl_rdm_rx_dma_desc_cache_init_done_get, + hw, val, val == 1, 1000U, 10000U); + +err_exit: + return err; +} + int aq_hw_err_from_flags(struct aq_hw_s *hw) { int err = 0; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h index ffa6e4067c21..d89c63d88e4a 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h @@ -35,6 +35,7 @@ u32 aq_hw_read_reg(struct aq_hw_s *hw, u32 reg); void aq_hw_write_reg(struct aq_hw_s *hw, u32 reg, u32 value); u64 aq_hw_read_reg64(struct aq_hw_s *hw, u32 reg); void aq_hw_write_reg64(struct aq_hw_s *hw, u32 reg, u64 value); +int aq_hw_invalidate_descriptor_cache(struct aq_hw_s *hw); int aq_hw_err_from_flags(struct aq_hw_s *hw); int aq_hw_num_tcs(struct aq_hw_s *hw); int aq_hw_q_per_tc(struct aq_hw_s *hw); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index f21de0c21e52..d23d23bed39f 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -547,6 +547,11 @@ static int __aq_ring_rx_clean(struct aq_ring_s *self, struct napi_struct *napi, if (!buff->is_eop) { unsigned int frag_cnt = 0U; + + /* There will be an extra fragment */ + if (buff->len > AQ_CFG_RX_HDR_SIZE) + frag_cnt++; + buff_ = buff; do { bool is_rsc_completed = true; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index 493432d036b9..c7895bfb2ecf 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -1198,26 +1198,9 @@ static int hw_atl_b0_hw_interrupt_moderation_set(struct aq_hw_s *self) static int hw_atl_b0_hw_stop(struct aq_hw_s *self) { - int err; - u32 val; - hw_atl_b0_hw_irq_disable(self, HW_ATL_B0_INT_MASK); - /* Invalidate Descriptor Cache to prevent writing to the cached - * descriptors and to the data pointer of those descriptors - */ - hw_atl_rdm_rx_dma_desc_cache_init_tgl(self); - - err = aq_hw_err_from_flags(self); - - if (err) - goto err_exit; - - readx_poll_timeout_atomic(hw_atl_rdm_rx_dma_desc_cache_init_done_get, - self, val, val == 1, 1000U, 10000U); - -err_exit: - return err; + return aq_hw_invalidate_descriptor_cache(self); } int hw_atl_b0_hw_ring_tx_stop(struct aq_hw_s *self, struct aq_ring_s *ring) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c index b0ed572e88c6..0ce9caae8799 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c @@ -759,7 +759,7 @@ static int hw_atl2_hw_stop(struct aq_hw_s *self) { hw_atl_b0_hw_irq_disable(self, HW_ATL2_INT_MASK); - return 0; + return aq_hw_invalidate_descriptor_cache(self); } static struct aq_stats_s *hw_atl2_utils_get_hw_stats(struct aq_hw_s *self) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3fc33b1b4dfb..a625e7c311dd 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12439,7 +12439,7 @@ static int bnxt_try_recover_fw(struct bnxt *bp) return -ENODEV; } -static void bnxt_clear_reservations(struct bnxt *bp, bool fw_reset) +void bnxt_clear_reservations(struct bnxt *bp, bool fw_reset) { struct bnxt_hw_resc *hw_resc = &bp->hw_resc; @@ -16892,6 +16892,10 @@ static void bnxt_shutdown(struct pci_dev *pdev) if (netif_running(dev)) netif_close(dev); + if (bnxt_hwrm_func_drv_unrgtr(bp)) { + pcie_flr(pdev); + goto shutdown_exit; + } bnxt_ptp_clear(bp); bnxt_clear_int_mode(bp); pci_disable_device(pdev); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 741b2d854789..3613a172483a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2149,7 +2149,7 @@ struct bnxt_bs_trace_info { static inline void bnxt_bs_trace_check_wrap(struct bnxt_bs_trace_info *bs_trace, u32 offset) { - if (!bs_trace->wrapped && + if (!bs_trace->wrapped && bs_trace->magic_byte && *bs_trace->magic_byte != BNXT_TRACE_BUF_MAGIC_BYTE) bs_trace->wrapped = 1; bs_trace->last_offset = offset; @@ -2941,6 +2941,7 @@ void bnxt_report_link(struct bnxt *bp); int bnxt_update_link(struct bnxt *bp, bool chng_link_state); int bnxt_hwrm_set_pause(struct bnxt *); int bnxt_hwrm_set_link_setting(struct bnxt *, bool, bool); +void bnxt_clear_reservations(struct bnxt *bp, bool fw_reset); int bnxt_cancel_reservations(struct bnxt *bp, bool fw_reset); int bnxt_hwrm_alloc_wol_fltr(struct bnxt *bp); int bnxt_hwrm_free_wol_fltr(struct bnxt *bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c index 0181ab1f2dfd..ccb8b509662d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c @@ -333,13 +333,14 @@ static void bnxt_fill_drv_seg_record(struct bnxt *bp, u32 offset = 0; int rc = 0; + record->max_entries = cpu_to_le32(ctxm->max_entries); + record->entry_size = cpu_to_le32(ctxm->entry_size); + rc = bnxt_dbg_hwrm_log_buffer_flush(bp, type, 0, &offset); if (rc) return; bnxt_bs_trace_check_wrap(bs_trace, offset); - record->max_entries = cpu_to_le32(ctxm->max_entries); - record->entry_size = cpu_to_le32(ctxm->entry_size); record->offset = cpu_to_le32(bs_trace->last_offset); record->wrapped = bs_trace->wrapped; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 02961d93ed35..67ca02d84c97 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -461,7 +461,7 @@ static int bnxt_dl_reload_down(struct devlink *dl, bool netns_change, rtnl_unlock(); break; } - bnxt_cancel_reservations(bp, false); + bnxt_clear_reservations(bp, false); bnxt_free_ctx_mem(bp, false); break; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index db81cf6d5289..0abaa2bbe357 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -1051,9 +1051,9 @@ static void bnxt_ptp_free(struct bnxt *bp) if (ptp->ptp_clock) { ptp_clock_unregister(ptp->ptp_clock); ptp->ptp_clock = NULL; - kfree(ptp->ptp_info.pin_config); - ptp->ptp_info.pin_config = NULL; } + kfree(ptp->ptp_info.pin_config); + ptp->ptp_info.pin_config = NULL; } int bnxt_ptp_init(struct bnxt *bp) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index cb004fd16252..5bb31c8fab39 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1296,7 +1296,8 @@ static void be_xmit_flush(struct be_adapter *adapter, struct be_tx_obj *txo) (adapter->bmc_filt_mask & BMC_FILT_MULTICAST) static bool be_send_pkt_to_bmc(struct be_adapter *adapter, - struct sk_buff **skb) + struct sk_buff **skb, + struct be_wrb_params *wrb_params) { struct ethhdr *eh = (struct ethhdr *)(*skb)->data; bool os2bmc = false; @@ -1360,7 +1361,7 @@ done: * to BMC, asic expects the vlan to be inline in the packet. */ if (os2bmc) - *skb = be_insert_vlan_in_pkt(adapter, *skb, NULL); + *skb = be_insert_vlan_in_pkt(adapter, *skb, wrb_params); return os2bmc; } @@ -1387,7 +1388,7 @@ static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev) /* if os2bmc is enabled and if the pkt is destined to bmc, * enqueue the pkt a 2nd time with mgmt bit set. */ - if (be_send_pkt_to_bmc(adapter, &skb)) { + if (be_send_pkt_to_bmc(adapter, &skb, &wrb_params)) { BE_WRB_F_SET(wrb_params.features, OS2BMC, 1); wrb_cnt = be_xmit_enqueue(adapter, txo, skb, &wrb_params); if (unlikely(!wrb_cnt)) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 41e0d85d15da..abf1ef8e76c6 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -687,6 +687,7 @@ struct fec_enet_private { unsigned int reload_period; int pps_enable; unsigned int next_counter; + bool perout_enable; struct hrtimer perout_timer; u64 perout_stime; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 1edcfaee6819..3222359ac15b 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1835,6 +1835,8 @@ fec_enet_rx_queue(struct net_device *ndev, u16 queue_id, int budget) ndev->stats.rx_packets++; pkt_len = fec16_to_cpu(bdp->cbd_datlen); ndev->stats.rx_bytes += pkt_len; + if (fep->quirks & FEC_QUIRK_HAS_RACC) + ndev->stats.rx_bytes -= 2; index = fec_enet_get_bd_index(bdp, &rxq->bd); page = rxq->rx_skb_info[index].page; diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c index fa88b47d526c..4b7bad9a485d 100644 --- a/drivers/net/ethernet/freescale/fec_ptp.c +++ b/drivers/net/ethernet/freescale/fec_ptp.c @@ -128,6 +128,12 @@ static int fec_ptp_enable_pps(struct fec_enet_private *fep, uint enable) spin_lock_irqsave(&fep->tmreg_lock, flags); + if (fep->perout_enable) { + spin_unlock_irqrestore(&fep->tmreg_lock, flags); + dev_err(&fep->pdev->dev, "PEROUT is running"); + return -EBUSY; + } + if (fep->pps_enable == enable) { spin_unlock_irqrestore(&fep->tmreg_lock, flags); return 0; @@ -243,6 +249,7 @@ static int fec_ptp_pps_perout(struct fec_enet_private *fep) * the FEC_TCCR register in time and missed the start time. */ if (fep->perout_stime < curr_time + 100 * NSEC_PER_MSEC) { + fep->perout_enable = false; dev_err(&fep->pdev->dev, "Current time is too close to the start time!\n"); spin_unlock_irqrestore(&fep->tmreg_lock, flags); return -1; @@ -497,7 +504,10 @@ static int fec_ptp_pps_disable(struct fec_enet_private *fep, uint channel) { unsigned long flags; + hrtimer_cancel(&fep->perout_timer); + spin_lock_irqsave(&fep->tmreg_lock, flags); + fep->perout_enable = false; writel(0, fep->hwp + FEC_TCSR(channel)); spin_unlock_irqrestore(&fep->tmreg_lock, flags); @@ -529,6 +539,8 @@ static int fec_ptp_enable(struct ptp_clock_info *ptp, return ret; } else if (rq->type == PTP_CLK_REQ_PEROUT) { + u32 reload_period; + /* Reject requests with unsupported flags */ if (rq->perout.flags) return -EOPNOTSUPP; @@ -548,12 +560,14 @@ static int fec_ptp_enable(struct ptp_clock_info *ptp, return -EOPNOTSUPP; } - fep->reload_period = div_u64(period_ns, 2); - if (on && fep->reload_period) { + reload_period = div_u64(period_ns, 2); + if (on && reload_period) { + u64 perout_stime; + /* Convert 1588 timestamp to ns*/ start_time.tv_sec = rq->perout.start.sec; start_time.tv_nsec = rq->perout.start.nsec; - fep->perout_stime = timespec64_to_ns(&start_time); + perout_stime = timespec64_to_ns(&start_time); mutex_lock(&fep->ptp_clk_mutex); if (!fep->ptp_clk_on) { @@ -562,18 +576,41 @@ static int fec_ptp_enable(struct ptp_clock_info *ptp, return -EOPNOTSUPP; } spin_lock_irqsave(&fep->tmreg_lock, flags); + + if (fep->pps_enable) { + dev_err(&fep->pdev->dev, "PPS is running"); + ret = -EBUSY; + goto unlock; + } + + if (fep->perout_enable) { + dev_err(&fep->pdev->dev, + "PEROUT has been enabled\n"); + ret = -EBUSY; + goto unlock; + } + /* Read current timestamp */ curr_time = timecounter_read(&fep->tc); - spin_unlock_irqrestore(&fep->tmreg_lock, flags); - mutex_unlock(&fep->ptp_clk_mutex); + if (perout_stime <= curr_time) { + dev_err(&fep->pdev->dev, + "Start time must be greater than current time\n"); + ret = -EINVAL; + goto unlock; + } /* Calculate time difference */ - delta = fep->perout_stime - curr_time; + delta = perout_stime - curr_time; + fep->reload_period = reload_period; + fep->perout_stime = perout_stime; + fep->perout_enable = true; - if (fep->perout_stime <= curr_time) { - dev_err(&fep->pdev->dev, "Start time must larger than current time!\n"); - return -EINVAL; - } +unlock: + spin_unlock_irqrestore(&fep->tmreg_lock, flags); + mutex_unlock(&fep->ptp_clk_mutex); + + if (ret) + return ret; /* Because the timer counter of FEC only has 31-bits, correspondingly, * the time comparison register FEC_TCCR also only low 31 bits can be @@ -681,8 +718,11 @@ static irqreturn_t fec_pps_interrupt(int irq, void *dev_id) fep->next_counter = (fep->next_counter + fep->reload_period) & fep->cc.mask; - event.type = PTP_CLOCK_PPS; - ptp_clock_event(fep->ptp_clock, &event); + if (fep->pps_enable) { + event.type = PTP_CLOCK_PPS; + ptp_clock_event(fep->ptp_clock, &event); + } + return IRQ_HANDLED; } diff --git a/drivers/net/ethernet/google/gve/gve_ptp.c b/drivers/net/ethernet/google/gve/gve_ptp.c index e96247c9d68d..a384a9ed4914 100644 --- a/drivers/net/ethernet/google/gve/gve_ptp.c +++ b/drivers/net/ethernet/google/gve/gve_ptp.c @@ -26,6 +26,19 @@ int gve_clock_nic_ts_read(struct gve_priv *priv) return 0; } +static int gve_ptp_gettimex64(struct ptp_clock_info *info, + struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + return -EOPNOTSUPP; +} + +static int gve_ptp_settime64(struct ptp_clock_info *info, + const struct timespec64 *ts) +{ + return -EOPNOTSUPP; +} + static long gve_ptp_do_aux_work(struct ptp_clock_info *info) { const struct gve_ptp *ptp = container_of(info, struct gve_ptp, info); @@ -47,6 +60,8 @@ out: static const struct ptp_clock_info gve_ptp_caps = { .owner = THIS_MODULE, .name = "gve clock", + .gettimex64 = gve_ptp_gettimex64, + .settime64 = gve_ptp_settime64, .do_aux_work = gve_ptp_do_aux_work, }; diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index a563a94e2780..122ee23497e6 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -146,7 +146,7 @@ config IXGBE tristate "Intel(R) 10GbE PCI Express adapters support" depends on PCI depends on PTP_1588_CLOCK_OPTIONAL - select LIBIE_FWLOG + select LIBIE_FWLOG if DEBUG_FS select MDIO select NET_DEVLINK select PLDMFW @@ -298,7 +298,7 @@ config ICE select DIMLIB select LIBIE select LIBIE_ADMINQ - select LIBIE_FWLOG + select LIBIE_FWLOG if DEBUG_FS select NET_DEVLINK select PACKING select PLDMFW diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index fb0f6365a6d6..8ec0f7d0fceb 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -3246,7 +3246,7 @@ void ice_ptp_init(struct ice_pf *pf) err = ice_ptp_init_port(pf, &ptp->port); if (err) - goto err_exit; + goto err_clean_pf; /* Start the PHY timestamping block */ ice_ptp_reset_phy_timestamping(pf); @@ -3263,13 +3263,19 @@ void ice_ptp_init(struct ice_pf *pf) dev_info(ice_pf_to_dev(pf), "PTP init successful\n"); return; +err_clean_pf: + mutex_destroy(&ptp->port.ps_lock); + ice_ptp_cleanup_pf(pf); err_exit: /* If we registered a PTP clock, release it */ if (pf->ptp.clock) { ptp_clock_unregister(ptp->clock); pf->ptp.clock = NULL; } - ptp->state = ICE_PTP_ERROR; + /* Keep ICE_PTP_UNINIT state to avoid ambiguity at driver unload + * and to avoid duplicated resources release. + */ + ptp->state = ICE_PTP_UNINIT; dev_err(ice_pf_to_dev(pf), "PTP failed %d\n", err); } @@ -3282,9 +3288,19 @@ err_exit: */ void ice_ptp_release(struct ice_pf *pf) { - if (pf->ptp.state != ICE_PTP_READY) + if (pf->ptp.state == ICE_PTP_UNINIT) return; + if (pf->ptp.state != ICE_PTP_READY) { + mutex_destroy(&pf->ptp.port.ps_lock); + ice_ptp_cleanup_pf(pf); + if (pf->ptp.clock) { + ptp_clock_unregister(pf->ptp.clock); + pf->ptp.clock = NULL; + } + return; + } + pf->ptp.state = ICE_PTP_UNINIT; /* Disable timestamping for both Tx and Rx */ diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c index 8c46481d2e1f..8cf4ff697572 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_main.c +++ b/drivers/net/ethernet/intel/idpf/idpf_main.c @@ -63,6 +63,8 @@ destroy_wqs: destroy_workqueue(adapter->vc_event_wq); for (i = 0; i < adapter->max_vports; i++) { + if (!adapter->vport_config[i]) + continue; kfree(adapter->vport_config[i]->user_config.q_coalesce); kfree(adapter->vport_config[i]); adapter->vport_config[i] = NULL; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 14d275270123..dce4936708eb 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -821,9 +821,7 @@ struct ixgbe_adapter { #ifdef CONFIG_IXGBE_HWMON struct hwmon_buff *ixgbe_hwmon_buff; #endif /* CONFIG_IXGBE_HWMON */ -#ifdef CONFIG_DEBUG_FS struct dentry *ixgbe_dbg_adapter; -#endif /*CONFIG_DEBUG_FS*/ u8 default_up; /* Bitmask indicating in use pools */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index aff17c37ddde..902d6abaa3ec 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -1516,10 +1516,8 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, pool->xdp_cnt = numptrs; pool->xdp = devm_kcalloc(pfvf->dev, numptrs, sizeof(struct xdp_buff *), GFP_KERNEL); - if (IS_ERR(pool->xdp)) { - netdev_err(pfvf->netdev, "Creation of xsk pool failed\n"); - return PTR_ERR(pool->xdp); - } + if (!pool->xdp) + return -ENOMEM; } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index e9f319a9bdd6..60f7ab1d72e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -66,8 +66,8 @@ void mlx5_cq_tasklet_cb(struct tasklet_struct *t) tasklet_schedule(&ctx->task); } -static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, - struct mlx5_eqe *eqe) +void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, + struct mlx5_eqe *eqe) { unsigned long flags; struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv; @@ -95,7 +95,15 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, if (schedule_tasklet) tasklet_schedule(&tasklet_ctx->task); } +EXPORT_SYMBOL(mlx5_add_cq_to_tasklet); +static void mlx5_core_cq_dummy_cb(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe) +{ + mlx5_core_err(cq->eq->core.dev, + "CQ default completion callback, CQ #%u\n", cq->cqn); +} + +#define MLX5_CQ_INIT_CMD_SN cpu_to_be32(2 << 28) /* Callers must verify outbox status in case of err */ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen) @@ -121,10 +129,19 @@ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->arm_sn = 0; cq->eq = eq; cq->uid = MLX5_GET(create_cq_in, in, uid); + + /* Kernel CQs must set the arm_db address prior to calling + * this function, allowing for the proper value to be + * initialized. User CQs are responsible for their own + * initialization since they do not use the arm_db field. + */ + if (cq->arm_db) + *cq->arm_db = MLX5_CQ_INIT_CMD_SN; + refcount_set(&cq->refcount, 1); init_completion(&cq->free); if (!cq->comp) - cq->comp = mlx5_add_cq_to_tasklet; + cq->comp = mlx5_core_cq_dummy_cb; /* assuming CQ will be deleted before the EQ */ cq->tasklet_ctx.priv = &eq->tasklet_ctx; INIT_LIST_HEAD(&cq->tasklet_ctx.list); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index fceea83abbd7..887adf4807d1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -541,7 +541,7 @@ static int mlx5_devlink_num_doorbells_validate(struct devlink *devlink, u32 id, max_num_channels = mlx5e_get_max_num_channels(mdev); if (val32 > max_num_channels) { NL_SET_ERR_MSG_FMT_MOD(extack, - "Requested num_doorbells (%u) exceeds maximum number of channels (%u)", + "Requested num_doorbells (%u) exceeds max number of channels (%u)", val32, max_num_channels); return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 14e3207b14e7..a163f81f07c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -634,7 +634,10 @@ struct mlx5e_dma_info { struct mlx5e_shampo_hd { struct mlx5e_frag_page *pages; u32 hd_per_wq; + u32 hd_per_page; u16 hd_per_wqe; + u8 log_hd_per_page; + u8 log_hd_entry_size; unsigned long *bitmap; u16 pi; u16 ci; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 0a4fb8c92268..35d9530037a6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -804,7 +804,8 @@ static int mlx5e_xfrm_add_state(struct net_device *dev, goto err_xfrm; } - if (mlx5_eswitch_block_mode(priv->mdev)) + err = mlx5_eswitch_block_mode(priv->mdev); + if (err) goto unblock_ipsec; if (x->props.mode == XFRM_MODE_TUNNEL && diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index d166c0d5189e..cf8f14ce4cd5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -595,32 +595,55 @@ static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev, struct mlx5_core_dev *mdev = priv->mdev; u8 max_bw_value[IEEE_8021QAZ_MAX_TCS]; u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS]; - __u64 upper_limit_mbps = roundup(255 * MLX5E_100MB, MLX5E_1GB); + __u64 upper_limit_mbps; + __u64 upper_limit_gbps; int i; + struct { + int scale; + const char *units_str; + } units[] = { + [MLX5_100_MBPS_UNIT] = { + .scale = 100, + .units_str = "Mbps", + }, + [MLX5_GBPS_UNIT] = { + .scale = 1, + .units_str = "Gbps", + }, + }; memset(max_bw_value, 0, sizeof(max_bw_value)); memset(max_bw_unit, 0, sizeof(max_bw_unit)); + upper_limit_mbps = 255 * MLX5E_100MB; + upper_limit_gbps = 255 * MLX5E_1GB; for (i = 0; i <= mlx5_max_tc(mdev); i++) { if (!maxrate->tc_maxrate[i]) { max_bw_unit[i] = MLX5_BW_NO_LIMIT; continue; } - if (maxrate->tc_maxrate[i] < upper_limit_mbps) { + if (maxrate->tc_maxrate[i] <= upper_limit_mbps) { max_bw_value[i] = div_u64(maxrate->tc_maxrate[i], MLX5E_100MB); max_bw_value[i] = max_bw_value[i] ? max_bw_value[i] : 1; max_bw_unit[i] = MLX5_100_MBPS_UNIT; - } else { + } else if (maxrate->tc_maxrate[i] <= upper_limit_gbps) { max_bw_value[i] = div_u64(maxrate->tc_maxrate[i], MLX5E_1GB); max_bw_unit[i] = MLX5_GBPS_UNIT; + } else { + netdev_err(netdev, + "tc_%d maxrate %llu Kbps exceeds limit %llu\n", + i, maxrate->tc_maxrate[i], + upper_limit_gbps); + return -EINVAL; } } for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { - netdev_dbg(netdev, "%s: tc_%d <=> max_bw %d Gbps\n", - __func__, i, max_bw_value[i]); + netdev_dbg(netdev, "%s: tc_%d <=> max_bw %u %s\n", __func__, i, + max_bw_value[i] * units[max_bw_unit[i]].scale, + units[max_bw_unit[i]].units_str); } return mlx5_modify_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 53e5ae252eac..893e1380a7c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -2125,14 +2125,12 @@ static int mlx5e_get_module_eeprom_by_page(struct net_device *netdev, if (!size_read) return i; - if (size_read == -EINVAL) - return -EINVAL; if (size_read < 0) { NL_SET_ERR_MSG_FMT_MOD( extack, "Query module eeprom by page failed, read %u bytes, err %d", i, size_read); - return i; + return size_read; } i += size_read; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 9c46511e7b43..5e17eae81f4b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -791,8 +791,9 @@ static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, int node) { void *wqc = MLX5_ADDR_OF(rqc, rqp->rqc, wq); + u8 log_hd_per_page, log_hd_entry_size; + u16 hd_per_wq, hd_per_wqe; u32 hd_pool_size; - u16 hd_per_wq; int wq_size; int err; @@ -815,11 +816,24 @@ static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, if (err) goto err_umr_mkey; - rq->mpwqe.shampo->hd_per_wqe = - mlx5e_shampo_hd_per_wqe(mdev, params, rqp); + hd_per_wqe = mlx5e_shampo_hd_per_wqe(mdev, params, rqp); wq_size = BIT(MLX5_GET(wq, wqc, log_wq_sz)); - hd_pool_size = (rq->mpwqe.shampo->hd_per_wqe * wq_size) / - MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; + + BUILD_BUG_ON(MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE > PAGE_SHIFT); + if (hd_per_wqe >= MLX5E_SHAMPO_WQ_HEADER_PER_PAGE) { + log_hd_per_page = MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE; + log_hd_entry_size = MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE; + } else { + log_hd_per_page = order_base_2(hd_per_wqe); + log_hd_entry_size = order_base_2(PAGE_SIZE / hd_per_wqe); + } + + rq->mpwqe.shampo->hd_per_wqe = hd_per_wqe; + rq->mpwqe.shampo->hd_per_page = BIT(log_hd_per_page); + rq->mpwqe.shampo->log_hd_per_page = log_hd_per_page; + rq->mpwqe.shampo->log_hd_entry_size = log_hd_entry_size; + + hd_pool_size = (hd_per_wqe * wq_size) >> log_hd_per_page; if (netif_rxq_has_unreadable_mp(rq->netdev, rq->ix)) { /* Separate page pool for shampo headers */ @@ -2205,7 +2219,6 @@ static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; *mcq->set_ci_db = 0; - *mcq->arm_db = 0; mcq->vector = param->eq_ix; mcq->comp = mlx5e_completion_event; mcq->event = mlx5e_cq_error_event; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 26621a2972ec..687cf123211d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -648,17 +648,20 @@ static void build_ksm_umr(struct mlx5e_icosq *sq, struct mlx5e_umr_wqe *umr_wqe, umr_wqe->hdr.uctrl.mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); } -static struct mlx5e_frag_page *mlx5e_shampo_hd_to_frag_page(struct mlx5e_rq *rq, int header_index) +static struct mlx5e_frag_page *mlx5e_shampo_hd_to_frag_page(struct mlx5e_rq *rq, + int header_index) { - BUILD_BUG_ON(MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE > PAGE_SHIFT); + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; - return &rq->mpwqe.shampo->pages[header_index >> MLX5E_SHAMPO_LOG_WQ_HEADER_PER_PAGE]; + return &shampo->pages[header_index >> shampo->log_hd_per_page]; } -static u64 mlx5e_shampo_hd_offset(int header_index) +static u64 mlx5e_shampo_hd_offset(struct mlx5e_rq *rq, int header_index) { - return (header_index & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) << - MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE; + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; + u32 hd_per_page = shampo->hd_per_page; + + return (header_index & (hd_per_page - 1)) << shampo->log_hd_entry_size; } static void mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index); @@ -671,7 +674,7 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, u16 pi, header_offset, err, wqe_bbs; u32 lkey = rq->mdev->mlx5e_res.hw_objs.mkey; struct mlx5e_umr_wqe *umr_wqe; - int headroom, i = 0; + int headroom, i; headroom = rq->buff.headroom; wqe_bbs = MLX5E_KSM_UMR_WQEBBS(ksm_entries); @@ -679,25 +682,24 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, umr_wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); build_ksm_umr(sq, umr_wqe, shampo->mkey_be, index, ksm_entries); - WARN_ON_ONCE(ksm_entries & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)); - while (i < ksm_entries) { - struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, index); + for (i = 0; i < ksm_entries; i++, index++) { + struct mlx5e_frag_page *frag_page; u64 addr; - err = mlx5e_page_alloc_fragmented(rq->hd_page_pool, frag_page); - if (unlikely(err)) - goto err_unmap; + frag_page = mlx5e_shampo_hd_to_frag_page(rq, index); + header_offset = mlx5e_shampo_hd_offset(rq, index); + if (!header_offset) { + err = mlx5e_page_alloc_fragmented(rq->hd_page_pool, + frag_page); + if (err) + goto err_unmap; + } addr = page_pool_get_dma_addr_netmem(frag_page->netmem); - - for (int j = 0; j < MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; j++) { - header_offset = mlx5e_shampo_hd_offset(index++); - - umr_wqe->inline_ksms[i++] = (struct mlx5_ksm) { - .key = cpu_to_be32(lkey), - .va = cpu_to_be64(addr + header_offset + headroom), - }; - } + umr_wqe->inline_ksms[i] = (struct mlx5_ksm) { + .key = cpu_to_be32(lkey), + .va = cpu_to_be64(addr + header_offset + headroom), + }; } sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) { @@ -713,9 +715,9 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, return 0; err_unmap: - while (--i) { + while (--i >= 0) { --index; - header_offset = mlx5e_shampo_hd_offset(index); + header_offset = mlx5e_shampo_hd_offset(rq, index); if (!header_offset) { struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, index); @@ -735,12 +737,11 @@ static int mlx5e_alloc_rx_hd_mpwqe(struct mlx5e_rq *rq) struct mlx5e_icosq *sq = rq->icosq; int i, err, max_ksm_entries, len; - max_ksm_entries = ALIGN_DOWN(MLX5E_MAX_KSM_PER_WQE(rq->mdev), - MLX5E_SHAMPO_WQ_HEADER_PER_PAGE); + max_ksm_entries = MLX5E_MAX_KSM_PER_WQE(rq->mdev); ksm_entries = bitmap_find_window(shampo->bitmap, shampo->hd_per_wqe, shampo->hd_per_wq, shampo->pi); - ksm_entries = ALIGN_DOWN(ksm_entries, MLX5E_SHAMPO_WQ_HEADER_PER_PAGE); + ksm_entries = ALIGN_DOWN(ksm_entries, shampo->hd_per_page); if (!ksm_entries) return 0; @@ -858,7 +859,7 @@ mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index) { struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; - if (((header_index + 1) & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) == 0) { + if (((header_index + 1) & (shampo->hd_per_page - 1)) == 0) { struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index); mlx5e_page_release_fragmented(rq->hd_page_pool, frag_page); @@ -1225,9 +1226,10 @@ static unsigned int mlx5e_lro_update_hdr(struct sk_buff *skb, static void *mlx5e_shampo_get_packet_hd(struct mlx5e_rq *rq, u16 header_index) { struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index); - u16 head_offset = mlx5e_shampo_hd_offset(header_index) + rq->buff.headroom; + u16 head_offset = mlx5e_shampo_hd_offset(rq, header_index); + void *addr = netmem_address(frag_page->netmem); - return netmem_address(frag_page->netmem) + head_offset; + return addr + head_offset + rq->buff.headroom; } static void mlx5e_shampo_update_ipv4_udp_hdr(struct mlx5e_rq *rq, struct iphdr *ipv4) @@ -2267,7 +2269,8 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, struct mlx5_cqe64 *cqe, u16 header_index) { struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index); - u16 head_offset = mlx5e_shampo_hd_offset(header_index); + u16 head_offset = mlx5e_shampo_hd_offset(rq, header_index); + struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; u16 head_size = cqe->shampo.header_size; u16 rx_headroom = rq->buff.headroom; struct sk_buff *skb = NULL; @@ -2283,7 +2286,7 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, data = hdr + rx_headroom; frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + head_size); - if (likely(frag_size <= BIT(MLX5E_SHAMPO_LOG_MAX_HEADER_ENTRY_SIZE))) { + if (likely(frag_size <= BIT(shampo->log_hd_entry_size))) { /* build SKB around header */ dma_sync_single_range_for_cpu(rq->pdev, dma_addr, 0, frag_size, rq->buff.map_dir); net_prefetchw(hdr); @@ -2356,7 +2359,10 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) { int nr_frags = skb_shinfo(skb)->nr_frags; - return PAGE_SIZE * nr_frags + data_bcnt <= GRO_LEGACY_MAX_SIZE; + if (PAGE_SIZE >= GRO_LEGACY_MAX_SIZE) + return skb->len + data_bcnt <= GRO_LEGACY_MAX_SIZE; + else + return PAGE_SIZE * nr_frags + data_bcnt <= GRO_LEGACY_MAX_SIZE; } static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index cb1319974f83..ccef64fb40b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -421,6 +421,13 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) __be64 *pas; u32 i; + conn->cq.mcq.cqe_sz = 64; + conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db; + conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1; + *conn->cq.mcq.set_ci_db = 0; + conn->cq.mcq.vector = 0; + conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; + cq_size = roundup_pow_of_two(cq_size); MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(cq_size)); @@ -468,15 +475,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) if (err) goto err_cqwq; - conn->cq.mcq.cqe_sz = 64; - conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db; - conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1; - *conn->cq.mcq.set_ci_db = 0; - *conn->cq.mcq.arm_db = 0; - conn->cq.mcq.vector = 0; - conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet); - mlx5_fpga_dbg(fdev, "Created CQ #0x%x\n", conn->cq.mcq.cqn); goto out; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index e18a850c615c..aa3b5878e3da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -324,10 +324,8 @@ err_xa: free_irq(irq->map.virq, &irq->nh); err_req_irq: #ifdef CONFIG_RFS_ACCEL - if (i && rmap && *rmap) { - free_irq_cpu_rmap(*rmap); - *rmap = NULL; - } + if (i && rmap && *rmap) + irq_cpu_rmap_remove(*rmap, irq->map.virq); err_irq_rmap: #endif if (i && pci_msix_can_alloc_dyn(dev->pdev)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index 24ef7d66fa8a..7510c46e58a5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -873,12 +873,6 @@ err_free_sqc: return err; } -static void hws_cq_complete(struct mlx5_core_cq *mcq, - struct mlx5_eqe *eqe) -{ - pr_err("CQ completion CQ: #%u\n", mcq->cqn); -} - static int hws_send_ring_alloc_cq(struct mlx5_core_dev *mdev, int numa_node, struct mlx5hws_send_engine *queue, @@ -901,7 +895,6 @@ static int hws_send_ring_alloc_cq(struct mlx5_core_dev *mdev, mcq->cqe_sz = 64; mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; - mcq->comp = hws_cq_complete; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { cqe = mlx5_cqwq_get_wqe(&cq->wq, i); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index 077a77fde670..d034372fa047 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1049,12 +1049,6 @@ static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn) return 0; } -static void dr_cq_complete(struct mlx5_core_cq *mcq, - struct mlx5_eqe *eqe) -{ - pr_err("CQ completion CQ: #%u\n", mcq->cqn); -} - static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, struct mlx5_uars_page *uar, size_t ncqe) @@ -1089,6 +1083,13 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK; } + cq->mcq.cqe_sz = 64; + cq->mcq.set_ci_db = cq->wq_ctrl.db.db; + cq->mcq.arm_db = cq->wq_ctrl.db.db + 1; + *cq->mcq.set_ci_db = 0; + cq->mcq.vector = 0; + cq->mdev = mdev; + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + sizeof(u64) * cq->wq_ctrl.buf.npages; in = kvzalloc(inlen, GFP_KERNEL); @@ -1112,27 +1113,12 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas); - cq->mcq.comp = dr_cq_complete; - err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); kvfree(in); if (err) goto err_cqwq; - cq->mcq.cqe_sz = 64; - cq->mcq.set_ci_db = cq->wq_ctrl.db.db; - cq->mcq.arm_db = cq->wq_ctrl.db.db + 1; - *cq->mcq.set_ci_db = 0; - - /* set no-zero value, in order to avoid the HW to run db-recovery on - * CQ that used in polling mode. - */ - *cq->mcq.arm_db = cpu_to_be32(2 << 28); - - cq->mcq.vector = 0; - cq->mdev = mdev; - return cq; err_cqwq: diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c index b032d5a4b3b8..10f5bc4892fc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c @@ -601,6 +601,8 @@ int mlxsw_linecard_devlink_info_get(struct mlxsw_linecard *linecard, err = devlink_info_version_fixed_put(req, DEVLINK_INFO_VERSION_GENERIC_FW_PSID, info->psid); + if (err) + goto unlock; sprintf(buf, "%u.%u.%u", info->fw_major, info->fw_minor, info->fw_sub_minor); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 6a4a81c63451..353fd9ca89a6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -830,8 +830,10 @@ int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp, return -EINVAL; rule = mlxsw_sp_acl_rule_lookup(mlxsw_sp, ruleset, f->cookie); - if (!rule) - return -EINVAL; + if (!rule) { + err = -EINVAL; + goto err_rule_get_stats; + } err = mlxsw_sp_acl_rule_get_stats(mlxsw_sp, rule, &packets, &bytes, &drops, &lastuse, &used_hw_stats); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c index c87cb9ed09e7..fcd9912e7ad3 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c @@ -201,7 +201,7 @@ static int fbnic_mbx_alloc_rx_msgs(struct fbnic_dev *fbd) return -ENODEV; /* Fill all but 1 unused descriptors in the Rx queue. */ - count = (head - tail - 1) % FBNIC_IPC_MBX_DESC_LEN; + count = (head - tail - 1) & (FBNIC_IPC_MBX_DESC_LEN - 1); while (!err && count--) { struct fbnic_tlv_msg *msg; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ethtool.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ethtool.c index 2474dfd330f4..fe4e61405284 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_ethtool.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ethtool.c @@ -294,7 +294,7 @@ static void lan966x_stats_update(struct lan966x *lan966x) { int i, j; - mutex_lock(&lan966x->stats_lock); + spin_lock(&lan966x->stats_lock); for (i = 0; i < lan966x->num_phys_ports; i++) { uint idx = i * lan966x->num_stats; @@ -310,7 +310,7 @@ static void lan966x_stats_update(struct lan966x *lan966x) } } - mutex_unlock(&lan966x->stats_lock); + spin_unlock(&lan966x->stats_lock); } static int lan966x_get_sset_count(struct net_device *dev, int sset) @@ -365,7 +365,7 @@ static void lan966x_get_eth_mac_stats(struct net_device *dev, idx = port->chip_port * lan966x->num_stats; - mutex_lock(&lan966x->stats_lock); + spin_lock(&lan966x->stats_lock); mac_stats->FramesTransmittedOK = lan966x->stats[idx + SYS_COUNT_TX_UC] + @@ -416,7 +416,7 @@ static void lan966x_get_eth_mac_stats(struct net_device *dev, lan966x->stats[idx + SYS_COUNT_RX_LONG] + lan966x->stats[idx + SYS_COUNT_RX_PMAC_LONG]; - mutex_unlock(&lan966x->stats_lock); + spin_unlock(&lan966x->stats_lock); } static const struct ethtool_rmon_hist_range lan966x_rmon_ranges[] = { @@ -442,7 +442,7 @@ static void lan966x_get_eth_rmon_stats(struct net_device *dev, idx = port->chip_port * lan966x->num_stats; - mutex_lock(&lan966x->stats_lock); + spin_lock(&lan966x->stats_lock); rmon_stats->undersize_pkts = lan966x->stats[idx + SYS_COUNT_RX_SHORT] + @@ -500,7 +500,7 @@ static void lan966x_get_eth_rmon_stats(struct net_device *dev, lan966x->stats[idx + SYS_COUNT_TX_SZ_1024_1526] + lan966x->stats[idx + SYS_COUNT_TX_PMAC_SZ_1024_1526]; - mutex_unlock(&lan966x->stats_lock); + spin_unlock(&lan966x->stats_lock); *ranges = lan966x_rmon_ranges; } @@ -603,7 +603,7 @@ void lan966x_stats_get(struct net_device *dev, idx = port->chip_port * lan966x->num_stats; - mutex_lock(&lan966x->stats_lock); + spin_lock(&lan966x->stats_lock); stats->rx_bytes = lan966x->stats[idx + SYS_COUNT_RX_OCT] + lan966x->stats[idx + SYS_COUNT_RX_PMAC_OCT]; @@ -685,7 +685,7 @@ void lan966x_stats_get(struct net_device *dev, stats->collisions = lan966x->stats[idx + SYS_COUNT_TX_COL]; - mutex_unlock(&lan966x->stats_lock); + spin_unlock(&lan966x->stats_lock); } int lan966x_stats_init(struct lan966x *lan966x) @@ -701,7 +701,7 @@ int lan966x_stats_init(struct lan966x *lan966x) return -ENOMEM; /* Init stats worker */ - mutex_init(&lan966x->stats_lock); + spin_lock_init(&lan966x->stats_lock); snprintf(queue_name, sizeof(queue_name), "%s-stats", dev_name(lan966x->dev)); lan966x->stats_queue = create_singlethread_workqueue(queue_name); diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index 7001584f1b7a..47752d3fde0b 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -1261,7 +1261,6 @@ cleanup_ports: cancel_delayed_work_sync(&lan966x->stats_work); destroy_workqueue(lan966x->stats_queue); - mutex_destroy(&lan966x->stats_lock); debugfs_remove_recursive(lan966x->debugfs_root); @@ -1279,7 +1278,6 @@ static void lan966x_remove(struct platform_device *pdev) cancel_delayed_work_sync(&lan966x->stats_work); destroy_workqueue(lan966x->stats_queue); - mutex_destroy(&lan966x->stats_lock); lan966x_mac_purge_entries(lan966x); lan966x_mdb_deinit(lan966x); diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h index 4f75f0688369..eea286c29474 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h @@ -295,8 +295,8 @@ struct lan966x { const struct lan966x_stat_layout *stats_layout; u32 num_stats; - /* workqueue for reading stats */ - struct mutex stats_lock; + /* lock for reading stats */ + spinlock_t stats_lock; u64 *stats; struct delayed_work stats_work; struct workqueue_struct *stats_queue; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c index b4377b8613c3..8c40db90ee8f 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c @@ -1,11 +1,14 @@ // SPDX-License-Identifier: GPL-2.0+ #include <linux/ptp_classify.h> +#include <linux/units.h> #include "lan966x_main.h" #include "vcap_api.h" #include "vcap_api_client.h" +#define LAN9X66_CLOCK_RATE 165617754 + #define LAN966X_MAX_PTP_ID 512 /* Represents 1ppm adjustment in 2^59 format with 6.037735849ns as reference @@ -1126,5 +1129,5 @@ void lan966x_ptp_rxtstamp(struct lan966x *lan966x, struct sk_buff *skb, u32 lan966x_ptp_get_period_ps(void) { /* This represents the system clock period in picoseconds */ - return 15125; + return PICO / LAN9X66_CLOCK_RATE; } diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_vcap_impl.c b/drivers/net/ethernet/microchip/lan966x/lan966x_vcap_impl.c index a1471e38d118..2a37fc1ba4bc 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_vcap_impl.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_vcap_impl.c @@ -403,11 +403,11 @@ static void lan966x_es0_read_esdx_counter(struct lan966x *lan966x, u32 counter; id = id & 0xff; /* counter limit */ - mutex_lock(&lan966x->stats_lock); + spin_lock(&lan966x->stats_lock); lan_wr(SYS_STAT_CFG_STAT_VIEW_SET(id), lan966x, SYS_STAT_CFG); counter = lan_rd(lan966x, SYS_CNT(LAN966X_STAT_ESDX_GRN_PKTS)) + lan_rd(lan966x, SYS_CNT(LAN966X_STAT_ESDX_YEL_PKTS)); - mutex_unlock(&lan966x->stats_lock); + spin_unlock(&lan966x->stats_lock); if (counter) admin->cache.counter = counter; } @@ -417,14 +417,14 @@ static void lan966x_es0_write_esdx_counter(struct lan966x *lan966x, { id = id & 0xff; /* counter limit */ - mutex_lock(&lan966x->stats_lock); + spin_lock(&lan966x->stats_lock); lan_wr(SYS_STAT_CFG_STAT_VIEW_SET(id), lan966x, SYS_STAT_CFG); lan_wr(0, lan966x, SYS_CNT(LAN966X_STAT_ESDX_GRN_BYTES)); lan_wr(admin->cache.counter, lan966x, SYS_CNT(LAN966X_STAT_ESDX_GRN_PKTS)); lan_wr(0, lan966x, SYS_CNT(LAN966X_STAT_ESDX_YEL_BYTES)); lan_wr(0, lan966x, SYS_CNT(LAN966X_STAT_ESDX_YEL_PKTS)); - mutex_unlock(&lan966x->stats_lock); + spin_unlock(&lan966x->stats_lock); } static void lan966x_vcap_cache_write(struct net_device *dev, diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index d10b58ebf603..301ebee2fdc5 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -29,6 +29,10 @@ static void ionic_tx_clean(struct ionic_queue *q, static inline void ionic_txq_post(struct ionic_queue *q, bool ring_dbell) { + /* Ensure TX descriptor writes reach memory before NIC reads them. + * Prevents device from fetching stale descriptors. + */ + dma_wmb(); ionic_q_post(q, ring_dbell); } @@ -1444,19 +1448,6 @@ static int ionic_tx_tso(struct net_device *netdev, struct ionic_queue *q, bool encap; int err; - desc_info = &q->tx_info[q->head_idx]; - - if (unlikely(ionic_tx_map_skb(q, skb, desc_info))) - return -EIO; - - len = skb->len; - mss = skb_shinfo(skb)->gso_size; - outer_csum = (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | - SKB_GSO_GRE_CSUM | - SKB_GSO_IPXIP4 | - SKB_GSO_IPXIP6 | - SKB_GSO_UDP_TUNNEL | - SKB_GSO_UDP_TUNNEL_CSUM)); has_vlan = !!skb_vlan_tag_present(skb); vlan_tci = skb_vlan_tag_get(skb); encap = skb->encapsulation; @@ -1470,12 +1461,21 @@ static int ionic_tx_tso(struct net_device *netdev, struct ionic_queue *q, err = ionic_tx_tcp_inner_pseudo_csum(skb); else err = ionic_tx_tcp_pseudo_csum(skb); - if (unlikely(err)) { - /* clean up mapping from ionic_tx_map_skb */ - ionic_tx_desc_unmap_bufs(q, desc_info); + if (unlikely(err)) return err; - } + desc_info = &q->tx_info[q->head_idx]; + if (unlikely(ionic_tx_map_skb(q, skb, desc_info))) + return -EIO; + + len = skb->len; + mss = skb_shinfo(skb)->gso_size; + outer_csum = (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | + SKB_GSO_GRE_CSUM | + SKB_GSO_IPXIP4 | + SKB_GSO_IPXIP6 | + SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM)); if (encap) hdrlen = skb_inner_tcp_all_headers(skb); else diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 847fa62c80df..e338bfc8b7b2 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -4,6 +4,7 @@ * Copyright (c) 2019-2020 Marvell International Ltd. */ +#include <linux/array_size.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> @@ -960,7 +961,7 @@ static inline void qede_tpa_cont(struct qede_dev *edev, { int i; - for (i = 0; cqe->len_list[i]; i++) + for (i = 0; cqe->len_list[i] && i < ARRAY_SIZE(cqe->len_list); i++) qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index, le16_to_cpu(cqe->len_list[i])); @@ -985,7 +986,7 @@ static int qede_tpa_end(struct qede_dev *edev, dma_unmap_page(rxq->dev, tpa_info->buffer.mapping, PAGE_SIZE, rxq->data_direction); - for (i = 0; cqe->len_list[i]; i++) + for (i = 0; cqe->len_list[i] && i < ARRAY_SIZE(cqe->len_list); i++) qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index, le16_to_cpu(cqe->len_list[i])); if (unlikely(i > 1)) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index d18734fe12e4..853aabedb128 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1514,11 +1514,20 @@ static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp) static void rtl_set_d3_pll_down(struct rtl8169_private *tp, bool enable) { - if (tp->mac_version >= RTL_GIGA_MAC_VER_25 && - tp->mac_version != RTL_GIGA_MAC_VER_28 && - tp->mac_version != RTL_GIGA_MAC_VER_31 && - tp->mac_version != RTL_GIGA_MAC_VER_38) - r8169_mod_reg8_cond(tp, PMCH, D3_NO_PLL_DOWN, !enable); + switch (tp->mac_version) { + case RTL_GIGA_MAC_VER_02 ... RTL_GIGA_MAC_VER_24: + case RTL_GIGA_MAC_VER_28: + case RTL_GIGA_MAC_VER_31: + case RTL_GIGA_MAC_VER_38: + break; + case RTL_GIGA_MAC_VER_80: + r8169_mod_reg8_cond(tp, PMCH, D3_NO_PLL_DOWN, true); + break; + default: + r8169_mod_reg8_cond(tp, PMCH, D3HOT_NO_PLL_DOWN, true); + r8169_mod_reg8_cond(tp, PMCH, D3COLD_NO_PLL_DOWN, !enable); + break; + } } static void rtl_reset_packet_filter(struct rtl8169_private *tp) diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c index 75bad561b352..849c5a6c2af1 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c @@ -1521,8 +1521,10 @@ static int sxgbe_rx(struct sxgbe_priv_data *priv, int limit) skb = priv->rxq[qnum]->rx_skbuff[entry]; - if (unlikely(!skb)) + if (unlikely(!skb)) { netdev_err(priv->dev, "rx descriptor is not consistent\n"); + break; + } prefetch(skb->data - NET_IP_ALIGN); priv->rxq[qnum]->rx_skbuff[entry] = NULL; diff --git a/drivers/net/ethernet/spacemit/k1_emac.c b/drivers/net/ethernet/spacemit/k1_emac.c index e1c5faff3b71..220eb5ce7583 100644 --- a/drivers/net/ethernet/spacemit/k1_emac.c +++ b/drivers/net/ethernet/spacemit/k1_emac.c @@ -1441,6 +1441,9 @@ static int emac_set_pauseparam(struct net_device *dev, struct emac_priv *priv = netdev_priv(dev); u8 fc = 0; + if (!netif_running(dev)) + return -ENETDOWN; + priv->flow_control_autoneg = pause->autoneg; if (pause->autoneg) { diff --git a/drivers/net/ethernet/ti/am65-cpsw-qos.c b/drivers/net/ethernet/ti/am65-cpsw-qos.c index fa96db7c1a13..66e8b224827b 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-qos.c +++ b/drivers/net/ethernet/ti/am65-cpsw-qos.c @@ -276,9 +276,31 @@ static int am65_cpsw_iet_set_verify_timeout_count(struct am65_cpsw_port *port) /* The number of wireside clocks contained in the verify * timeout counter. The default is 0x1312d0 * (10ms at 125Mhz in 1G mode). + * The frequency of the clock depends on the link speed + * and the PHY interface. */ - val = 125 * HZ_PER_MHZ; /* assuming 125MHz wireside clock */ + switch (port->slave.phy_if) { + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: + if (port->qos.link_speed == SPEED_1000) + val = 125 * HZ_PER_MHZ; /* 125 MHz at 1000Mbps*/ + else if (port->qos.link_speed == SPEED_100) + val = 25 * HZ_PER_MHZ; /* 25 MHz at 100Mbps*/ + else + val = (25 * HZ_PER_MHZ) / 10; /* 2.5 MHz at 10Mbps*/ + break; + + case PHY_INTERFACE_MODE_QSGMII: + case PHY_INTERFACE_MODE_SGMII: + val = 125 * HZ_PER_MHZ; /* 125 MHz */ + break; + default: + netdev_err(port->ndev, "selected mode does not supported IET\n"); + return -EOPNOTSUPP; + } val /= MILLIHZ_PER_HZ; /* count per ms timeout */ val *= verify_time_ms; /* count for timeout ms */ @@ -295,20 +317,21 @@ static int am65_cpsw_iet_verify_wait(struct am65_cpsw_port *port) u32 ctrl, status; int try; - try = 20; - do { - /* Reset the verify state machine by writing 1 - * to LINKFAIL - */ - ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL); - ctrl |= AM65_CPSW_PN_IET_MAC_LINKFAIL; - writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL); + try = 3; - /* Clear MAC_LINKFAIL bit to start Verify. */ - ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL); - ctrl &= ~AM65_CPSW_PN_IET_MAC_LINKFAIL; - writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL); + /* Reset the verify state machine by writing 1 + * to LINKFAIL + */ + ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL); + ctrl |= AM65_CPSW_PN_IET_MAC_LINKFAIL; + writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL); + /* Clear MAC_LINKFAIL bit to start Verify. */ + ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL); + ctrl &= ~AM65_CPSW_PN_IET_MAC_LINKFAIL; + writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL); + + do { msleep(port->qos.iet.verify_time_ms); status = readl(port->port_base + AM65_CPSW_PN_REG_IET_STATUS); @@ -330,7 +353,7 @@ static int am65_cpsw_iet_verify_wait(struct am65_cpsw_port *port) netdev_dbg(port->ndev, "MAC Merge verify error\n"); return -ENODEV; } - } while (try-- > 0); + } while (--try > 0); netdev_dbg(port->ndev, "MAC Merge verify timeout\n"); return -ETIMEDOUT; diff --git a/drivers/net/ethernet/ti/icssg/icssg_config.c b/drivers/net/ethernet/ti/icssg/icssg_config.c index da53eb04b0a4..3f8237c17d09 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_config.c +++ b/drivers/net/ethernet/ti/icssg/icssg_config.c @@ -66,6 +66,9 @@ #define FDB_GEN_CFG1 0x60 #define SMEM_VLAN_OFFSET 8 #define SMEM_VLAN_OFFSET_MASK GENMASK(25, 8) +#define FDB_HASH_SIZE_MASK GENMASK(6, 3) +#define FDB_HASH_SIZE_SHIFT 3 +#define FDB_HASH_SIZE 3 #define FDB_GEN_CFG2 0x64 #define FDB_VLAN_EN BIT(6) @@ -463,6 +466,8 @@ void icssg_init_emac_mode(struct prueth *prueth) /* Set VLAN TABLE address base */ regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, SMEM_VLAN_OFFSET_MASK, addr << SMEM_VLAN_OFFSET); + regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, FDB_HASH_SIZE_MASK, + FDB_HASH_SIZE << FDB_HASH_SIZE_SHIFT); /* Set enable VLAN aware mode, and FDBs for all PRUs */ regmap_write(prueth->miig_rt, FDB_GEN_CFG2, (FDB_PRU0_EN | FDB_PRU1_EN | FDB_HOST_EN)); prueth->vlan_tbl = (struct prueth_vlan_tbl __force *)(prueth->shram.va + @@ -484,6 +489,8 @@ void icssg_init_fw_offload_mode(struct prueth *prueth) /* Set VLAN TABLE address base */ regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, SMEM_VLAN_OFFSET_MASK, addr << SMEM_VLAN_OFFSET); + regmap_update_bits(prueth->miig_rt, FDB_GEN_CFG1, FDB_HASH_SIZE_MASK, + FDB_HASH_SIZE << FDB_HASH_SIZE_SHIFT); /* Set enable VLAN aware mode, and FDBs for all PRUs */ regmap_write(prueth->miig_rt, FDB_GEN_CFG2, FDB_EN_ALL); prueth->vlan_tbl = (struct prueth_vlan_tbl __force *)(prueth->shram.va + diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 857820657bac..5ee13db568f0 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1338,10 +1338,10 @@ int netcp_txpipe_open(struct netcp_tx_pipe *tx_pipe) tx_pipe->dma_channel = knav_dma_open_channel(dev, tx_pipe->dma_chan_name, &config); - if (IS_ERR(tx_pipe->dma_channel)) { + if (!tx_pipe->dma_channel) { dev_err(dev, "failed opening tx chan(%s)\n", tx_pipe->dma_chan_name); - ret = PTR_ERR(tx_pipe->dma_channel); + ret = -EINVAL; goto err; } @@ -1359,7 +1359,7 @@ int netcp_txpipe_open(struct netcp_tx_pipe *tx_pipe) return 0; err: - if (!IS_ERR_OR_NULL(tx_pipe->dma_channel)) + if (tx_pipe->dma_channel) knav_dma_close_channel(tx_pipe->dma_channel); tx_pipe->dma_channel = NULL; return ret; @@ -1678,10 +1678,10 @@ static int netcp_setup_navigator_resources(struct net_device *ndev) netcp->rx_channel = knav_dma_open_channel(netcp->netcp_device->device, netcp->dma_chan_name, &config); - if (IS_ERR(netcp->rx_channel)) { + if (!netcp->rx_channel) { dev_err(netcp->ndev_dev, "failed opening rx chan(%s\n", netcp->dma_chan_name); - ret = PTR_ERR(netcp->rx_channel); + ret = -EINVAL; goto fail; } diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c b/drivers/net/ethernet/toshiba/ps3_gelic_net.c index 5ee8e8980393..591866fc9055 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c +++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c @@ -260,6 +260,7 @@ void gelic_card_down(struct gelic_card *card) if (atomic_dec_if_positive(&card->users) == 0) { pr_debug("%s: real do\n", __func__); napi_disable(&card->napi); + timer_delete_sync(&card->rx_oom_timer); /* * Disable irq. Wireless interrupts will * be disabled later if any @@ -970,7 +971,8 @@ static void gelic_net_pass_skb_up(struct gelic_descr *descr, * gelic_card_decode_one_descr - processes an rx descriptor * @card: card structure * - * returns 1 if a packet has been sent to the stack, otherwise 0 + * returns 1 if a packet has been sent to the stack, -ENOMEM on skb alloc + * failure, otherwise 0 * * processes an rx descriptor by iommu-unmapping the data buffer and passing * the packet up to the stack @@ -981,16 +983,18 @@ static int gelic_card_decode_one_descr(struct gelic_card *card) struct gelic_descr_chain *chain = &card->rx_chain; struct gelic_descr *descr = chain->head; struct net_device *netdev = NULL; - int dmac_chain_ended; + int dmac_chain_ended = 0; + int prepare_rx_ret; status = gelic_descr_get_status(descr); if (status == GELIC_DESCR_DMA_CARDOWNED) return 0; - if (status == GELIC_DESCR_DMA_NOT_IN_USE) { + if (status == GELIC_DESCR_DMA_NOT_IN_USE || !descr->skb) { dev_dbg(ctodev(card), "dormant descr? %p\n", descr); - return 0; + dmac_chain_ended = 1; + goto refill; } /* netdevice select */ @@ -1048,9 +1052,10 @@ static int gelic_card_decode_one_descr(struct gelic_card *card) refill: /* is the current descriptor terminated with next_descr == NULL? */ - dmac_chain_ended = - be32_to_cpu(descr->hw_regs.dmac_cmd_status) & - GELIC_DESCR_RX_DMA_CHAIN_END; + if (!dmac_chain_ended) + dmac_chain_ended = + be32_to_cpu(descr->hw_regs.dmac_cmd_status) & + GELIC_DESCR_RX_DMA_CHAIN_END; /* * So that always DMAC can see the end * of the descriptor chain to avoid @@ -1062,10 +1067,11 @@ refill: gelic_descr_set_status(descr, GELIC_DESCR_DMA_NOT_IN_USE); /* - * this call can fail, but for now, just leave this - * descriptor without skb + * this call can fail, propagate the error */ - gelic_descr_prepare_rx(card, descr); + prepare_rx_ret = gelic_descr_prepare_rx(card, descr); + if (prepare_rx_ret) + return prepare_rx_ret; chain->tail = descr; chain->head = descr->next; @@ -1087,6 +1093,13 @@ refill: return 1; } +static void gelic_rx_oom_timer(struct timer_list *t) +{ + struct gelic_card *card = timer_container_of(card, t, rx_oom_timer); + + napi_schedule(&card->napi); +} + /** * gelic_net_poll - NAPI poll function called by the stack to return packets * @napi: napi structure @@ -1099,14 +1112,22 @@ static int gelic_net_poll(struct napi_struct *napi, int budget) { struct gelic_card *card = container_of(napi, struct gelic_card, napi); int packets_done = 0; + int work_result = 0; while (packets_done < budget) { - if (!gelic_card_decode_one_descr(card)) + work_result = gelic_card_decode_one_descr(card); + if (work_result != 1) break; packets_done++; } + if (work_result == -ENOMEM) { + napi_complete_done(napi, packets_done); + mod_timer(&card->rx_oom_timer, jiffies + 1); + return packets_done; + } + if (packets_done < budget) { napi_complete_done(napi, packets_done); gelic_card_rx_irq_on(card); @@ -1576,6 +1597,8 @@ static struct gelic_card *gelic_alloc_card_net(struct net_device **netdev) mutex_init(&card->updown_lock); atomic_set(&card->users, 0); + timer_setup(&card->rx_oom_timer, gelic_rx_oom_timer, 0); + return card; } diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.h b/drivers/net/ethernet/toshiba/ps3_gelic_net.h index f7d7931e51b7..c10f1984a5a1 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_net.h +++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.h @@ -268,6 +268,7 @@ struct gelic_vlan_id { struct gelic_card { struct napi_struct napi; struct net_device *netdev[GELIC_PORT_MAX]; + struct timer_list rx_oom_timer; /* * hypervisor requires irq_status should be * 8 bytes aligned, but u64 member is diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index 1e2713f0c921..b37d6cfbfbe9 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -2427,7 +2427,8 @@ int wx_sw_init(struct wx *wx) wx->oem_svid = pdev->subsystem_vendor; wx->oem_ssid = pdev->subsystem_device; wx->bus.device = PCI_SLOT(pdev->devfn); - wx->bus.func = PCI_FUNC(pdev->devfn); + wx->bus.func = FIELD_GET(WX_CFG_PORT_ST_LANID, + rd32(wx, WX_CFG_PORT_ST)); if (wx->oem_svid == PCI_VENDOR_ID_WANGXUN || pdev->is_virtfn) { diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index d89b9b8a0a2c..2f8319e03182 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -97,6 +97,8 @@ #define WX_CFG_PORT_CTL_DRV_LOAD BIT(3) #define WX_CFG_PORT_CTL_QINQ BIT(2) #define WX_CFG_PORT_CTL_D_VLAN BIT(0) /* double vlan*/ +#define WX_CFG_PORT_ST 0x14404 +#define WX_CFG_PORT_ST_LANID GENMASK(9, 8) #define WX_CFG_TAG_TPID(_i) (0x14430 + ((_i) * 4)) #define WX_CFG_PORT_CTL_NUM_VT_MASK GENMASK(13, 12) /* number of TVs */ @@ -557,8 +559,6 @@ enum WX_MSCA_CMD_value { #define TXD_USE_COUNT(S) DIV_ROUND_UP((S), WX_MAX_DATA_PER_TXD) #define DESC_NEEDED (MAX_SKB_FRAGS + 4) -#define WX_CFG_PORT_ST 0x14404 - /******************* Receive Descriptor bit definitions **********************/ #define WX_RXD_STAT_DD BIT(0) /* Done */ #define WX_RXD_STAT_EOP BIT(1) /* End of Packet */ diff --git a/drivers/net/mdio/mdio-airoha.c b/drivers/net/mdio/mdio-airoha.c index 1dc9939c8d7d..52e7475121ea 100644 --- a/drivers/net/mdio/mdio-airoha.c +++ b/drivers/net/mdio/mdio-airoha.c @@ -219,6 +219,8 @@ static int airoha_mdio_probe(struct platform_device *pdev) priv = bus->priv; priv->base_addr = addr; priv->regmap = device_node_to_regmap(dev->parent->of_node); + if (IS_ERR(priv->regmap)) + return PTR_ERR(priv->regmap); priv->clk = devm_clk_get_enabled(dev, NULL); if (IS_ERR(priv->clk)) diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 5d8d0214786c..bb6e03a92956 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -936,6 +936,7 @@ static ssize_t userdatum_value_store(struct config_item *item, const char *buf, if (count > MAX_EXTRADATA_VALUE_LEN) return -EMSGSIZE; + mutex_lock(&netconsole_subsys.su_mutex); mutex_lock(&dynamic_netconsole_mutex); ret = strscpy(udm->value, buf, sizeof(udm->value)); @@ -949,6 +950,7 @@ static ssize_t userdatum_value_store(struct config_item *item, const char *buf, ret = count; out_unlock: mutex_unlock(&dynamic_netconsole_mutex); + mutex_unlock(&netconsole_subsys.su_mutex); return ret; } @@ -974,6 +976,7 @@ static ssize_t sysdata_msgid_enabled_store(struct config_item *item, if (ret) return ret; + mutex_lock(&netconsole_subsys.su_mutex); mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_MSGID); if (msgid_enabled == curr) @@ -994,6 +997,7 @@ unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); + mutex_unlock(&netconsole_subsys.su_mutex); return ret; } @@ -1008,6 +1012,7 @@ static ssize_t sysdata_release_enabled_store(struct config_item *item, if (ret) return ret; + mutex_lock(&netconsole_subsys.su_mutex); mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_RELEASE); if (release_enabled == curr) @@ -1028,6 +1033,7 @@ unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); + mutex_unlock(&netconsole_subsys.su_mutex); return ret; } @@ -1042,6 +1048,7 @@ static ssize_t sysdata_taskname_enabled_store(struct config_item *item, if (ret) return ret; + mutex_lock(&netconsole_subsys.su_mutex); mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_TASKNAME); if (taskname_enabled == curr) @@ -1062,6 +1069,7 @@ unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); + mutex_unlock(&netconsole_subsys.su_mutex); return ret; } @@ -1077,6 +1085,7 @@ static ssize_t sysdata_cpu_nr_enabled_store(struct config_item *item, if (ret) return ret; + mutex_lock(&netconsole_subsys.su_mutex); mutex_lock(&dynamic_netconsole_mutex); curr = !!(nt->sysdata_fields & SYSDATA_CPU_NR); if (cpu_nr_enabled == curr) @@ -1105,6 +1114,7 @@ unlock_ok: ret = strnlen(buf, count); unlock: mutex_unlock(&dynamic_netconsole_mutex); + mutex_unlock(&netconsole_subsys.su_mutex); return ret; } diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index cad6ed3aa10b..4354241137d5 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -73,8 +73,11 @@ int mdiobus_register_device(struct mdio_device *mdiodev) return err; err = mdiobus_register_reset(mdiodev); - if (err) + if (err) { + gpiod_put(mdiodev->reset_gpio); + mdiodev->reset_gpio = NULL; return err; + } /* Assert the reset signal */ mdio_device_reset(mdiodev, 1); diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 604b5de0c158..01c87c9b7702 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -466,6 +466,12 @@ struct lan8842_priv { u16 rev; }; +struct lanphy_reg_data { + int page; + u16 addr; + u16 val; +}; + static const struct kszphy_type lan8814_type = { .led_mode_reg = ~LAN8814_LED_CTRL_1, .cable_diag_reg = LAN8814_CABLE_DIAG, @@ -2836,6 +2842,13 @@ static int ksz886x_cable_test_get_status(struct phy_device *phydev, #define LAN8814_PAGE_PCS_DIGITAL 2 /** + * LAN8814_PAGE_EEE - Selects Extended Page 3. + * + * This page contains EEE registers + */ +#define LAN8814_PAGE_EEE 3 + +/** * LAN8814_PAGE_COMMON_REGS - Selects Extended Page 4. * * This page contains device-common registers that affect the entire chip. @@ -2854,6 +2867,13 @@ static int ksz886x_cable_test_get_status(struct phy_device *phydev, #define LAN8814_PAGE_PORT_REGS 5 /** + * LAN8814_PAGE_POWER_REGS - Selects Extended Page 28. + * + * This page contains analog control registers and power mode registers. + */ +#define LAN8814_PAGE_POWER_REGS 28 + +/** * LAN8814_PAGE_SYSTEM_CTRL - Selects Extended Page 31. * * This page appears to hold fundamental system or global controls. In the @@ -4360,12 +4380,6 @@ static int lan8814_config_init(struct phy_device *phydev) { struct kszphy_priv *lan8814 = phydev->priv; - /* Reset the PHY */ - lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, - LAN8814_QSGMII_SOFT_RESET, - LAN8814_QSGMII_SOFT_RESET_BIT, - LAN8814_QSGMII_SOFT_RESET_BIT); - /* Disable ANEG with QSGMII PCS Host side */ lanphy_modify_page_reg(phydev, LAN8814_PAGE_PORT_REGS, LAN8814_QSGMII_PCS1G_ANEG_CONFIG, @@ -4451,6 +4465,12 @@ static int lan8814_probe(struct phy_device *phydev) addr, sizeof(struct lan8814_shared_priv)); if (phy_package_init_once(phydev)) { + /* Reset the PHY */ + lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, + LAN8814_QSGMII_SOFT_RESET, + LAN8814_QSGMII_SOFT_RESET_BIT, + LAN8814_QSGMII_SOFT_RESET_BIT); + err = lan8814_release_coma_mode(phydev); if (err) return err; @@ -5884,6 +5904,144 @@ static int lan8842_probe(struct phy_device *phydev) return 0; } +#define LAN8814_POWER_MGMT_MODE_3_ANEG_MDI 0x13 +#define LAN8814_POWER_MGMT_MODE_4_ANEG_MDIX 0x14 +#define LAN8814_POWER_MGMT_MODE_5_10BT_MDI 0x15 +#define LAN8814_POWER_MGMT_MODE_6_10BT_MDIX 0x16 +#define LAN8814_POWER_MGMT_MODE_7_100BT_TRAIN 0x17 +#define LAN8814_POWER_MGMT_MODE_8_100BT_MDI 0x18 +#define LAN8814_POWER_MGMT_MODE_9_100BT_EEE_MDI_TX 0x19 +#define LAN8814_POWER_MGMT_MODE_10_100BT_EEE_MDI_RX 0x1a +#define LAN8814_POWER_MGMT_MODE_11_100BT_MDIX 0x1b +#define LAN8814_POWER_MGMT_MODE_12_100BT_EEE_MDIX_TX 0x1c +#define LAN8814_POWER_MGMT_MODE_13_100BT_EEE_MDIX_RX 0x1d +#define LAN8814_POWER_MGMT_MODE_14_100BTX_EEE_TX_RX 0x1e + +#define LAN8814_POWER_MGMT_DLLPD_D BIT(0) +#define LAN8814_POWER_MGMT_ADCPD_D BIT(1) +#define LAN8814_POWER_MGMT_PGAPD_D BIT(2) +#define LAN8814_POWER_MGMT_TXPD_D BIT(3) +#define LAN8814_POWER_MGMT_DLLPD_C BIT(4) +#define LAN8814_POWER_MGMT_ADCPD_C BIT(5) +#define LAN8814_POWER_MGMT_PGAPD_C BIT(6) +#define LAN8814_POWER_MGMT_TXPD_C BIT(7) +#define LAN8814_POWER_MGMT_DLLPD_B BIT(8) +#define LAN8814_POWER_MGMT_ADCPD_B BIT(9) +#define LAN8814_POWER_MGMT_PGAPD_B BIT(10) +#define LAN8814_POWER_MGMT_TXPD_B BIT(11) +#define LAN8814_POWER_MGMT_DLLPD_A BIT(12) +#define LAN8814_POWER_MGMT_ADCPD_A BIT(13) +#define LAN8814_POWER_MGMT_PGAPD_A BIT(14) +#define LAN8814_POWER_MGMT_TXPD_A BIT(15) + +#define LAN8814_POWER_MGMT_C_D (LAN8814_POWER_MGMT_DLLPD_D | \ + LAN8814_POWER_MGMT_ADCPD_D | \ + LAN8814_POWER_MGMT_PGAPD_D | \ + LAN8814_POWER_MGMT_DLLPD_C | \ + LAN8814_POWER_MGMT_ADCPD_C | \ + LAN8814_POWER_MGMT_PGAPD_C) + +#define LAN8814_POWER_MGMT_B_C_D (LAN8814_POWER_MGMT_C_D | \ + LAN8814_POWER_MGMT_DLLPD_B | \ + LAN8814_POWER_MGMT_ADCPD_B | \ + LAN8814_POWER_MGMT_PGAPD_B) + +#define LAN8814_POWER_MGMT_VAL1 (LAN8814_POWER_MGMT_C_D | \ + LAN8814_POWER_MGMT_ADCPD_B | \ + LAN8814_POWER_MGMT_PGAPD_B | \ + LAN8814_POWER_MGMT_ADCPD_A | \ + LAN8814_POWER_MGMT_PGAPD_A) + +#define LAN8814_POWER_MGMT_VAL2 LAN8814_POWER_MGMT_C_D + +#define LAN8814_POWER_MGMT_VAL3 (LAN8814_POWER_MGMT_C_D | \ + LAN8814_POWER_MGMT_DLLPD_B | \ + LAN8814_POWER_MGMT_ADCPD_B | \ + LAN8814_POWER_MGMT_PGAPD_A) + +#define LAN8814_POWER_MGMT_VAL4 (LAN8814_POWER_MGMT_B_C_D | \ + LAN8814_POWER_MGMT_ADCPD_A | \ + LAN8814_POWER_MGMT_PGAPD_A) + +#define LAN8814_POWER_MGMT_VAL5 LAN8814_POWER_MGMT_B_C_D + +#define LAN8814_EEE_WAKE_TX_TIMER 0x0e +#define LAN8814_EEE_WAKE_TX_TIMER_MAX_VAL 0x1f + +static const struct lanphy_reg_data short_center_tap_errata[] = { + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_3_ANEG_MDI, + LAN8814_POWER_MGMT_VAL1 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_4_ANEG_MDIX, + LAN8814_POWER_MGMT_VAL1 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_5_10BT_MDI, + LAN8814_POWER_MGMT_VAL1 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_6_10BT_MDIX, + LAN8814_POWER_MGMT_VAL1 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_7_100BT_TRAIN, + LAN8814_POWER_MGMT_VAL2 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_8_100BT_MDI, + LAN8814_POWER_MGMT_VAL3 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_9_100BT_EEE_MDI_TX, + LAN8814_POWER_MGMT_VAL3 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_10_100BT_EEE_MDI_RX, + LAN8814_POWER_MGMT_VAL4 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_11_100BT_MDIX, + LAN8814_POWER_MGMT_VAL5 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_12_100BT_EEE_MDIX_TX, + LAN8814_POWER_MGMT_VAL5 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_13_100BT_EEE_MDIX_RX, + LAN8814_POWER_MGMT_VAL4 }, + { LAN8814_PAGE_POWER_REGS, + LAN8814_POWER_MGMT_MODE_14_100BTX_EEE_TX_RX, + LAN8814_POWER_MGMT_VAL4 }, +}; + +static const struct lanphy_reg_data waketx_timer_errata[] = { + { LAN8814_PAGE_EEE, + LAN8814_EEE_WAKE_TX_TIMER, + LAN8814_EEE_WAKE_TX_TIMER_MAX_VAL }, +}; + +static int lanphy_write_reg_data(struct phy_device *phydev, + const struct lanphy_reg_data *data, + size_t num) +{ + int ret = 0; + + while (num--) { + ret = lanphy_write_page_reg(phydev, data->page, data->addr, + data->val); + if (ret) + break; + } + + return ret; +} + +static int lan8842_erratas(struct phy_device *phydev) +{ + int ret; + + ret = lanphy_write_reg_data(phydev, short_center_tap_errata, + ARRAY_SIZE(short_center_tap_errata)); + if (ret) + return ret; + + return lanphy_write_reg_data(phydev, waketx_timer_errata, + ARRAY_SIZE(waketx_timer_errata)); +} + static int lan8842_config_init(struct phy_device *phydev) { int ret; @@ -5896,6 +6054,11 @@ static int lan8842_config_init(struct phy_device *phydev) if (ret < 0) return ret; + /* Apply the erratas for this device */ + ret = lan8842_erratas(phydev); + if (ret < 0) + return ret; + /* Even if the GPIOs are set to control the LEDs the behaviour of the * LEDs is wrong, they are not blinking when there is traffic. * To fix this it is required to set extended LED mode diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c index 0c8dc16ee7bd..2a873f791733 100644 --- a/drivers/net/phy/mxl-gpy.c +++ b/drivers/net/phy/mxl-gpy.c @@ -540,7 +540,7 @@ static int gpy_update_interface(struct phy_device *phydev) /* Interface mode is fixed for USXGMII and integrated PHY */ if (phydev->interface == PHY_INTERFACE_MODE_USXGMII || phydev->interface == PHY_INTERFACE_MODE_INTERNAL) - return -EINVAL; + return 0; /* Automatically switch SERDES interface between SGMII and 2500-BaseX * according to speed. Disable ANEG in 2500-BaseX mode. @@ -578,13 +578,7 @@ static int gpy_update_interface(struct phy_device *phydev) break; } - if (phydev->speed == SPEED_2500 || phydev->speed == SPEED_1000) { - ret = genphy_read_master_slave(phydev); - if (ret < 0) - return ret; - } - - return gpy_update_mdix(phydev); + return 0; } static int gpy_read_status(struct phy_device *phydev) @@ -639,6 +633,16 @@ static int gpy_read_status(struct phy_device *phydev) ret = gpy_update_interface(phydev); if (ret < 0) return ret; + + if (phydev->speed == SPEED_2500 || phydev->speed == SPEED_1000) { + ret = genphy_read_master_slave(phydev); + if (ret < 0) + return ret; + } + + ret = gpy_update_mdix(phydev); + if (ret < 0) + return ret; } return 0; diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 9d7799ea1c17..918244308215 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -637,6 +637,9 @@ static int phylink_validate(struct phylink *pl, unsigned long *supported, static void phylink_fill_fixedlink_supported(unsigned long *supported) { + linkmode_set_bit(ETHTOOL_LINK_MODE_Pause_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, supported); linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, supported); linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, supported); linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, supported); diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index 17f07eb0ee52..25562b17debe 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -1191,10 +1191,6 @@ static int team_port_add(struct team *team, struct net_device *port_dev, return -EPERM; } - err = team_dev_type_check_change(dev, port_dev); - if (err) - return err; - if (port_dev->flags & IFF_UP) { NL_SET_ERR_MSG(extack, "Device is up. Set it down before adding it as a team port"); netdev_err(dev, "Device %s is up. Set it down before adding it as a team port\n", @@ -1212,10 +1208,16 @@ static int team_port_add(struct team *team, struct net_device *port_dev, INIT_LIST_HEAD(&port->qom_list); port->orig.mtu = port_dev->mtu; - err = dev_set_mtu(port_dev, dev->mtu); - if (err) { - netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err); - goto err_set_mtu; + /* + * MTU assignment will be handled in team_dev_type_check_change + * if dev and port_dev are of different types + */ + if (dev->type == port_dev->type) { + err = dev_set_mtu(port_dev, dev->mtu); + if (err) { + netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err); + goto err_set_mtu; + } } memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len); @@ -1290,6 +1292,10 @@ static int team_port_add(struct team *team, struct net_device *port_dev, } } + err = team_dev_type_check_change(dev, port_dev); + if (err) + goto err_set_dev_type; + if (dev->flags & IFF_UP) { netif_addr_lock_bh(dev); dev_uc_sync_multiple(port_dev, dev); @@ -1308,6 +1314,7 @@ static int team_port_add(struct team *team, struct net_device *port_dev, return 0; +err_set_dev_type: err_set_slave_promisc: __team_option_inst_del_port(team, port); diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h index 81662328b2c7..a5f93b6c4482 100644 --- a/drivers/net/tun_vnet.h +++ b/drivers/net/tun_vnet.h @@ -244,7 +244,7 @@ tun_vnet_hdr_tnl_from_skb(unsigned int flags, if (virtio_net_hdr_tnl_from_skb(skb, tnl_hdr, has_tnl_offload, tun_vnet_is_little_endian(flags), - vlan_hlen)) { + vlan_hlen, true)) { struct virtio_net_hdr_v1 *hdr = &tnl_hdr->hash_hdr.hdr; struct skb_shared_info *sinfo = skb_shinfo(skb); diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 11352d85475a..3a4985b582cb 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -192,6 +192,12 @@ static int qmimux_rx_fixup(struct usbnet *dev, struct sk_buff *skb) if (!skbn) return 0; + /* Raw IP packets don't have a MAC header, but other subsystems + * (like xfrm) may still access MAC header offsets, so they must + * be initialized. + */ + skb_reset_mac_header(skbn); + switch (skb->data[offset + qmimux_hdr_sz] & 0xf0) { case 0x40: skbn->protocol = htons(ETH_P_IP); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a3046142cb8e..cc502bf022d5 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -392,14 +392,12 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) } /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ __skb_push(skb, ETH_HLEN); - /* Depend on prior success packets started NAPI consumer via - * __veth_xdp_flush(). Cancel TXQ stop if consumer stopped, - * paired with empty check in veth_poll(). - */ netif_tx_stop_queue(txq); - smp_mb__after_atomic(); - if (unlikely(__ptr_ring_empty(&rq->xdp_ring))) - netif_tx_wake_queue(txq); + /* Makes sure NAPI peer consumer runs. Consumer is responsible + * for starting txq again, until then ndo_start_xmit (this + * function) will not be invoked by the netstack again. + */ + __veth_xdp_flush(rq); break; case NET_RX_DROP: /* same as NET_XMIT_DROP */ drop: @@ -900,17 +898,9 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { - struct veth_priv *priv = netdev_priv(rq->dev); - int queue_idx = rq->xdp_rxq.queue_index; - struct netdev_queue *peer_txq; - struct net_device *peer_dev; int i, done = 0, n_xdpf = 0; void *xdpf[VETH_XDP_BATCH]; - /* NAPI functions as RCU section */ - peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); - peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL; - for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); @@ -959,9 +949,6 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, rq->stats.vs.xdp_packets += done; u64_stats_update_end(&rq->stats.syncp); - if (peer_txq && unlikely(netif_tx_queue_stopped(peer_txq))) - netif_tx_wake_queue(peer_txq); - return done; } @@ -969,17 +956,28 @@ static int veth_poll(struct napi_struct *napi, int budget) { struct veth_rq *rq = container_of(napi, struct veth_rq, xdp_napi); + struct veth_priv *priv = netdev_priv(rq->dev); + int queue_idx = rq->xdp_rxq.queue_index; + struct netdev_queue *peer_txq; struct veth_stats stats = {}; + struct net_device *peer_dev; struct veth_xdp_tx_bq bq; int done; bq.count = 0; + /* NAPI functions as RCU section */ + peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); + peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL; + xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); if (stats.xdp_redirect > 0) xdp_do_flush(); + if (stats.xdp_tx > 0) + veth_xdp_flush(rq, &bq); + xdp_clear_return_frame_no_direct(); if (done < budget && napi_complete_done(napi, done)) { /* Write rx_notify_masked before reading ptr_ring */ @@ -992,9 +990,12 @@ static int veth_poll(struct napi_struct *napi, int budget) } } - if (stats.xdp_tx > 0) - veth_xdp_flush(rq, &bq); - xdp_clear_return_frame_no_direct(); + /* Release backpressure per NAPI poll */ + smp_rmb(); /* Paired with netif_tx_stop_queue set_bit */ + if (peer_txq && netif_tx_queue_stopped(peer_txq)) { + txq_trans_cond_update(peer_txq); + netif_tx_wake_queue(peer_txq); + } return done; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 8e8a179aaa49..8e04adb57f52 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -910,17 +910,6 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, goto ok; } - /* - * Verify that we can indeed put this data into a skb. - * This is here to handle cases when the device erroneously - * tries to receive more than is possible. This is usually - * the case of a broken device. - */ - if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) { - net_dbg_ratelimited("%s: too much data\n", skb->dev->name); - dev_kfree_skb(skb); - return NULL; - } BUG_ON(offset >= PAGE_SIZE); while (len) { unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len); @@ -2112,9 +2101,19 @@ static struct sk_buff *receive_big(struct net_device *dev, struct virtnet_rq_stats *stats) { struct page *page = buf; - struct sk_buff *skb = - page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, 0); + struct sk_buff *skb; + /* Make sure that len does not exceed the size allocated in + * add_recvbuf_big. + */ + if (unlikely(len > (vi->big_packets_num_skbfrags + 1) * PAGE_SIZE)) { + pr_debug("%s: rx error: len %u exceeds allocated size %lu\n", + dev->name, len, + (vi->big_packets_num_skbfrags + 1) * PAGE_SIZE); + goto err; + } + + skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, 0); u64_stats_add(&stats->bytes, len - vi->hdr_len); if (unlikely(!skb)) goto err; @@ -2539,6 +2538,13 @@ err_buf: return NULL; } +static inline u32 +virtio_net_hash_value(const struct virtio_net_hdr_v1_hash *hdr_hash) +{ + return __le16_to_cpu(hdr_hash->hash_value_lo) | + (__le16_to_cpu(hdr_hash->hash_value_hi) << 16); +} + static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, struct sk_buff *skb) { @@ -2565,7 +2571,7 @@ static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, default: rss_hash_type = PKT_HASH_TYPE_NONE; } - skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type); + skb_set_hash(skb, virtio_net_hash_value(hdr_hash), rss_hash_type); } static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, @@ -2625,22 +2631,28 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, return; } - /* 1. Save the flags early, as the XDP program might overwrite them. + /* About the flags below: + * 1. Save the flags early, as the XDP program might overwrite them. * These flags ensure packets marked as VIRTIO_NET_HDR_F_DATA_VALID * stay valid after XDP processing. * 2. XDP doesn't work with partially checksummed packets (refer to * virtnet_xdp_set()), so packets marked as * VIRTIO_NET_HDR_F_NEEDS_CSUM get dropped during XDP processing. */ - flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; - if (vi->mergeable_rx_bufs) + if (vi->mergeable_rx_bufs) { + flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); - else if (vi->big_packets) + } else if (vi->big_packets) { + void *p = page_address((struct page *)buf); + + flags = ((struct virtio_net_common_hdr *)p)->hdr.flags; skb = receive_big(dev, vi, rq, buf, len, stats); - else + } else { + flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags; skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats); + } if (unlikely(!skb)) return; @@ -3311,6 +3323,10 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest); + /* Make sure it's safe to cast between formats */ + BUILD_BUG_ON(__alignof__(*hdr) != __alignof__(hdr->hash_hdr)); + BUILD_BUG_ON(__alignof__(*hdr) != __alignof__(hdr->hash_hdr.hdr)); + can_push = vi->any_header_sg && !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) && !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len; @@ -3323,7 +3339,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) hdr = &skb_vnet_common_hdr(skb)->tnl_hdr; if (virtio_net_hdr_tnl_from_skb(skb, hdr, vi->tx_tnl, - virtio_is_little_endian(vi->vdev), 0)) + virtio_is_little_endian(vi->vdev), 0, + false)) return -EPROTO; if (vi->mergeable_rx_bufs) @@ -6750,7 +6767,7 @@ static int virtnet_xdp_rx_hash(const struct xdp_md *_ctx, u32 *hash, hash_report = VIRTIO_NET_HASH_REPORT_NONE; *rss_type = virtnet_xdp_rss_type[hash_report]; - *hash = __le32_to_cpu(hdr_hash->hash_value); + *hash = virtio_net_hash_value(hdr_hash); return 0; } diff --git a/drivers/net/wan/framer/pef2256/pef2256.c b/drivers/net/wan/framer/pef2256/pef2256.c index c5501826db1e..c058cc79137d 100644 --- a/drivers/net/wan/framer/pef2256/pef2256.c +++ b/drivers/net/wan/framer/pef2256/pef2256.c @@ -648,7 +648,8 @@ static int pef2256_add_audio_devices(struct pef2256 *pef2256) audio_devs[i].id = i; } - ret = mfd_add_devices(pef2256->dev, 0, audio_devs, count, NULL, 0, NULL); + ret = devm_mfd_add_devices(pef2256->dev, 0, audio_devs, count, + NULL, 0, NULL); kfree(audio_devs); return ret; } @@ -822,8 +823,8 @@ static int pef2256_probe(struct platform_device *pdev) platform_set_drvdata(pdev, pef2256); - ret = mfd_add_devices(pef2256->dev, 0, pef2256_devs, - ARRAY_SIZE(pef2256_devs), NULL, 0, NULL); + ret = devm_mfd_add_devices(pef2256->dev, 0, pef2256_devs, + ARRAY_SIZE(pef2256_devs), NULL, 0, NULL); if (ret) { dev_err(pef2256->dev, "add devices failed (%d)\n", ret); return ret; diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c index b3b00d324075..b4aad6604d6d 100644 --- a/drivers/net/wireless/ath/ath10k/wmi.c +++ b/drivers/net/wireless/ath/ath10k/wmi.c @@ -1764,32 +1764,33 @@ void ath10k_wmi_put_wmi_channel(struct ath10k *ar, struct wmi_channel *ch, int ath10k_wmi_wait_for_service_ready(struct ath10k *ar) { - unsigned long timeout = jiffies + WMI_SERVICE_READY_TIMEOUT_HZ; unsigned long time_left, i; - /* Sometimes the PCI HIF doesn't receive interrupt - * for the service ready message even if the buffer - * was completed. PCIe sniffer shows that it's - * because the corresponding CE ring doesn't fires - * it. Workaround here by polling CE rings. Since - * the message could arrive at any time, continue - * polling until timeout. - */ - do { + time_left = wait_for_completion_timeout(&ar->wmi.service_ready, + WMI_SERVICE_READY_TIMEOUT_HZ); + if (!time_left) { + /* Sometimes the PCI HIF doesn't receive interrupt + * for the service ready message even if the buffer + * was completed. PCIe sniffer shows that it's + * because the corresponding CE ring doesn't fires + * it. Workaround here by polling CE rings once. + */ + ath10k_warn(ar, "failed to receive service ready completion, polling..\n"); + for (i = 0; i < CE_COUNT; i++) ath10k_hif_send_complete_check(ar, i, 1); - /* The 100 ms granularity is a tradeoff considering scheduler - * overhead and response latency - */ time_left = wait_for_completion_timeout(&ar->wmi.service_ready, - msecs_to_jiffies(100)); - if (time_left) - return 0; - } while (time_before(jiffies, timeout)); + WMI_SERVICE_READY_TIMEOUT_HZ); + if (!time_left) { + ath10k_warn(ar, "polling timed out\n"); + return -ETIMEDOUT; + } + + ath10k_warn(ar, "service ready completion received, continuing normally\n"); + } - ath10k_warn(ar, "failed to receive service ready completion\n"); - return -ETIMEDOUT; + return 0; } int ath10k_wmi_wait_for_unified_ready(struct ath10k *ar) diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c index 0491e3fd6b5e..e3b444333dee 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.c +++ b/drivers/net/wireless/ath/ath11k/wmi.c @@ -5961,6 +5961,9 @@ static int wmi_process_mgmt_tx_comp(struct ath11k *ar, dma_unmap_single(ar->ab->dev, skb_cb->paddr, msdu->len, DMA_TO_DEVICE); info = IEEE80211_SKB_CB(msdu); + memset(&info->status, 0, sizeof(info->status)); + info->status.rates[0].idx = -1; + if ((!(info->flags & IEEE80211_TX_CTL_NO_ACK)) && !tx_compl_param->status) { info->flags |= IEEE80211_TX_STAT_ACK; diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index eacab798630a..db351c922018 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4064,68 +4064,12 @@ static int ath12k_mac_fils_discovery(struct ath12k_link_vif *arvif, return ret; } -static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif) -{ - struct ath12k *ar = arvif->ar; - struct ieee80211_vif *vif = arvif->ahvif->vif; - struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf; - enum wmi_sta_powersave_param param; - struct ieee80211_bss_conf *info; - enum wmi_sta_ps_mode psmode; - int ret; - int timeout; - bool enable_ps; - - lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); - - if (vif->type != NL80211_IFTYPE_STATION) - return; - - enable_ps = arvif->ahvif->ps; - if (enable_ps) { - psmode = WMI_STA_PS_MODE_ENABLED; - param = WMI_STA_PS_PARAM_INACTIVITY_TIME; - - timeout = conf->dynamic_ps_timeout; - if (timeout == 0) { - info = ath12k_mac_get_link_bss_conf(arvif); - if (!info) { - ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n", - vif->addr, arvif->link_id); - return; - } - - /* firmware doesn't like 0 */ - timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000; - } - - ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param, - timeout); - if (ret) { - ath12k_warn(ar->ab, "failed to set inactivity time for vdev %d: %i\n", - arvif->vdev_id, ret); - return; - } - } else { - psmode = WMI_STA_PS_MODE_DISABLED; - } - - ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %d psmode %s\n", - arvif->vdev_id, psmode ? "enable" : "disable"); - - ret = ath12k_wmi_pdev_set_ps_mode(ar, arvif->vdev_id, psmode); - if (ret) - ath12k_warn(ar->ab, "failed to set sta power save mode %d for vdev %d: %d\n", - psmode, arvif->vdev_id, ret); -} - static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 changed) { struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); unsigned long links = ahvif->links_map; - struct ieee80211_vif_cfg *vif_cfg; struct ieee80211_bss_conf *info; struct ath12k_link_vif *arvif; struct ieee80211_sta *sta; @@ -4189,24 +4133,61 @@ static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw, } } } +} - if (changed & BSS_CHANGED_PS) { - links = ahvif->links_map; - vif_cfg = &vif->cfg; +static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif) +{ + struct ath12k *ar = arvif->ar; + struct ieee80211_vif *vif = arvif->ahvif->vif; + struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf; + enum wmi_sta_powersave_param param; + struct ieee80211_bss_conf *info; + enum wmi_sta_ps_mode psmode; + int ret; + int timeout; + bool enable_ps; - for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { - arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); - if (!arvif || !arvif->ar) - continue; + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); - ar = arvif->ar; + if (vif->type != NL80211_IFTYPE_STATION) + return; + + enable_ps = arvif->ahvif->ps; + if (enable_ps) { + psmode = WMI_STA_PS_MODE_ENABLED; + param = WMI_STA_PS_PARAM_INACTIVITY_TIME; - if (ar->ab->hw_params->supports_sta_ps) { - ahvif->ps = vif_cfg->ps; - ath12k_mac_vif_setup_ps(arvif); + timeout = conf->dynamic_ps_timeout; + if (timeout == 0) { + info = ath12k_mac_get_link_bss_conf(arvif); + if (!info) { + ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n", + vif->addr, arvif->link_id); + return; } + + /* firmware doesn't like 0 */ + timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000; } + + ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param, + timeout); + if (ret) { + ath12k_warn(ar->ab, "failed to set inactivity time for vdev %d: %i\n", + arvif->vdev_id, ret); + return; + } + } else { + psmode = WMI_STA_PS_MODE_DISABLED; } + + ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %d psmode %s\n", + arvif->vdev_id, psmode ? "enable" : "disable"); + + ret = ath12k_wmi_pdev_set_ps_mode(ar, arvif->vdev_id, psmode); + if (ret) + ath12k_warn(ar->ab, "failed to set sta power save mode %d for vdev %d: %d\n", + psmode, arvif->vdev_id, ret); } static bool ath12k_mac_supports_tpc(struct ath12k *ar, struct ath12k_vif *ahvif, @@ -4228,6 +4209,7 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, { struct ath12k_vif *ahvif = arvif->ahvif; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(ahvif); + struct ieee80211_vif_cfg *vif_cfg = &vif->cfg; struct cfg80211_chan_def def; u32 param_id, param_value; enum nl80211_band band; @@ -4514,6 +4496,12 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, } ath12k_mac_fils_discovery(arvif, info); + + if (changed & BSS_CHANGED_PS && + ar->ab->hw_params->supports_sta_ps) { + ahvif->ps = vif_cfg->ps; + ath12k_mac_vif_setup_ps(arvif); + } } static struct ath12k_vif_cache *ath12k_ahvif_get_link_cache(struct ath12k_vif *ahvif, diff --git a/drivers/net/wireless/intel/iwlwifi/mld/link.c b/drivers/net/wireless/intel/iwlwifi/mld/link.c index 60d814bf5779..f6f52d297a72 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/link.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/link.c @@ -708,18 +708,13 @@ static int iwl_mld_get_chan_load_from_element(struct iwl_mld *mld, struct ieee80211_bss_conf *link_conf) { - struct ieee80211_vif *vif = link_conf->vif; const struct cfg80211_bss_ies *ies; const struct element *bss_load_elem = NULL; const struct ieee80211_bss_load_elem *bss_load; guard(rcu)(); - if (ieee80211_vif_link_active(vif, link_conf->link_id)) - ies = rcu_dereference(link_conf->bss->beacon_ies); - else - ies = rcu_dereference(link_conf->bss->ies); - + ies = rcu_dereference(link_conf->bss->beacon_ies); if (ies) bss_load_elem = cfg80211_find_elem(WLAN_EID_QBSS_LOAD, ies->data, ies->len); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c index 9c9e0e1c6e1d..867807abde66 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c @@ -938,19 +938,12 @@ u8 iwl_mvm_mac_ctxt_get_lowest_rate(struct iwl_mvm *mvm, u16 iwl_mvm_mac_ctxt_get_beacon_flags(const struct iwl_fw *fw, u8 rate_idx) { + u16 flags = iwl_mvm_mac80211_idx_to_hwrate(fw, rate_idx); bool is_new_rate = iwl_fw_lookup_cmd_ver(fw, BEACON_TEMPLATE_CMD, 0) > 10; - u16 flags, cck_flag; - - if (is_new_rate) { - flags = iwl_mvm_mac80211_idx_to_hwrate(fw, rate_idx); - cck_flag = IWL_MAC_BEACON_CCK; - } else { - cck_flag = IWL_MAC_BEACON_CCK_V1; - flags = iwl_fw_rate_idx_to_plcp(rate_idx); - } if (rate_idx <= IWL_LAST_CCK_RATE) - flags |= cck_flag; + flags |= is_new_rate ? IWL_MAC_BEACON_CCK + : IWL_MAC_BEACON_CCK_V1; return flags; } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c index 0c9c2492d8a7..0b12ee8ad618 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c @@ -463,7 +463,7 @@ static int iwl_mvm_aux_roc_te_handle_notif(struct iwl_mvm *mvm, if (!aux_roc_te) /* Not a Aux ROC time event */ return -EINVAL; - iwl_mvm_te_check_trigger(mvm, notif, te_data); + iwl_mvm_te_check_trigger(mvm, notif, aux_roc_te); IWL_DEBUG_TE(mvm, "Aux ROC time event notification - UID = 0x%x action %d (error = %d)\n", @@ -475,14 +475,14 @@ static int iwl_mvm_aux_roc_te_handle_notif(struct iwl_mvm *mvm, /* End TE, notify mac80211 */ ieee80211_remain_on_channel_expired(mvm->hw); iwl_mvm_roc_finished(mvm); /* flush aux queue */ - list_del(&te_data->list); /* remove from list */ - te_data->running = false; - te_data->vif = NULL; - te_data->uid = 0; - te_data->id = TE_MAX; + list_del(&aux_roc_te->list); /* remove from list */ + aux_roc_te->running = false; + aux_roc_te->vif = NULL; + aux_roc_te->uid = 0; + aux_roc_te->id = TE_MAX; } else if (le32_to_cpu(notif->action) == TE_V2_NOTIF_HOST_EVENT_START) { set_bit(IWL_MVM_STATUS_ROC_AUX_RUNNING, &mvm->status); - te_data->running = true; + aux_roc_te->running = true; ieee80211_ready_on_channel(mvm->hw); /* Start TE */ } else { IWL_DEBUG_TE(mvm, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c index 22602c32faa5..fa995e235d9b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c @@ -159,9 +159,15 @@ int iwl_mvm_legacy_rate_to_mac80211_idx(u32 rate_n_flags, u8 iwl_mvm_mac80211_idx_to_hwrate(const struct iwl_fw *fw, int rate_idx) { - return (rate_idx >= IWL_FIRST_OFDM_RATE ? - rate_idx - IWL_FIRST_OFDM_RATE : - rate_idx); + if (iwl_fw_lookup_cmd_ver(fw, TX_CMD, 0) > 8) + /* In the new rate legacy rates are indexed: + * 0 - 3 for CCK and 0 - 7 for OFDM. + */ + return (rate_idx >= IWL_FIRST_OFDM_RATE ? + rate_idx - IWL_FIRST_OFDM_RATE : + rate_idx); + + return iwl_fw_rate_idx_to_plcp(rate_idx); } u8 iwl_mvm_mac80211_ac_to_ucode_ac(enum ieee80211_ac_numbers ac) diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c index 891e125ad30b..54d6d00ecdf1 100644 --- a/drivers/net/wireless/marvell/mwl8k.c +++ b/drivers/net/wireless/marvell/mwl8k.c @@ -2966,6 +2966,51 @@ mwl8k_cmd_rf_antenna(struct ieee80211_hw *hw, int antenna, int mask) /* * CMD_SET_BEACON. */ + +static bool mwl8k_beacon_has_ds_params(const u8 *buf, int len) +{ + const struct ieee80211_mgmt *mgmt = (const void *)buf; + int ies_len; + + if (len <= offsetof(struct ieee80211_mgmt, u.beacon.variable)) + return false; + + ies_len = len - offsetof(struct ieee80211_mgmt, u.beacon.variable); + + return cfg80211_find_ie(WLAN_EID_DS_PARAMS, mgmt->u.beacon.variable, + ies_len) != NULL; +} + +static void mwl8k_beacon_copy_inject_ds_params(struct ieee80211_hw *hw, + u8 *buf_dst, const u8 *buf_src, + int src_len) +{ + const struct ieee80211_mgmt *mgmt = (const void *)buf_src; + static const u8 before_ds_params[] = { + WLAN_EID_SSID, + WLAN_EID_SUPP_RATES, + }; + const u8 *ies; + int hdr_len, left, offs, pos; + + ies = mgmt->u.beacon.variable; + hdr_len = offsetof(struct ieee80211_mgmt, u.beacon.variable); + + offs = ieee80211_ie_split(ies, src_len - hdr_len, before_ds_params, + ARRAY_SIZE(before_ds_params), 0); + + pos = hdr_len + offs; + left = src_len - pos; + + memcpy(buf_dst, buf_src, pos); + + /* Inject a DSSS Parameter Set after SSID + Supp Rates */ + buf_dst[pos + 0] = WLAN_EID_DS_PARAMS; + buf_dst[pos + 1] = 1; + buf_dst[pos + 2] = hw->conf.chandef.chan->hw_value; + + memcpy(buf_dst + pos + 3, buf_src + pos, left); +} struct mwl8k_cmd_set_beacon { struct mwl8k_cmd_pkt_hdr header; __le16 beacon_len; @@ -2975,17 +3020,33 @@ struct mwl8k_cmd_set_beacon { static int mwl8k_cmd_set_beacon(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u8 *beacon, int len) { + bool ds_params_present = mwl8k_beacon_has_ds_params(beacon, len); struct mwl8k_cmd_set_beacon *cmd; - int rc; + int rc, final_len = len; - cmd = kzalloc(sizeof(*cmd) + len, GFP_KERNEL); + if (!ds_params_present) { + /* + * mwl8k firmware requires a DS Params IE with the current + * channel in AP beacons. If mac80211/hostapd does not + * include it, inject one here. IE ID + length + channel + * number = 3 bytes. + */ + final_len += 3; + } + + cmd = kzalloc(sizeof(*cmd) + final_len, GFP_KERNEL); if (cmd == NULL) return -ENOMEM; cmd->header.code = cpu_to_le16(MWL8K_CMD_SET_BEACON); - cmd->header.length = cpu_to_le16(sizeof(*cmd) + len); - cmd->beacon_len = cpu_to_le16(len); - memcpy(cmd->beacon, beacon, len); + cmd->header.length = cpu_to_le16(sizeof(*cmd) + final_len); + cmd->beacon_len = cpu_to_le16(final_len); + + if (ds_params_present) + memcpy(cmd->beacon, beacon, len); + else + mwl8k_beacon_copy_inject_ds_params(hw, cmd->beacon, beacon, + len); rc = mwl8k_post_pervif_cmd(hw, vif, &cmd->header); kfree(cmd); diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index ab904a7def1b..080c4f8a655a 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -7694,6 +7694,13 @@ int rtw89_hw_scan_add_chan_list_ax(struct rtw89_dev *rtwdev, INIT_LIST_HEAD(&list); list_for_each_entry_safe(ch_info, tmp, &scan_info->chan_list, list) { + /* The operating channel (tx_null == true) should + * not be last in the list, to avoid breaking + * RTL8851BU and RTL8832BU. + */ + if (list_len + 1 == RTW89_SCAN_LIST_LIMIT_AX && ch_info->tx_null) + break; + list_move_tail(&ch_info->list, &list); list_len++; diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index 9f856042a67a..5903d82e1ab1 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -2003,8 +2003,14 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, struct ieee80211_sta *sta = control->sta; struct ieee80211_bss_conf *bss_conf; + /* This can happen in case of monitor injection */ + if (!vif) { + ieee80211_free_txskb(hw, skb); + return; + } + if (link != IEEE80211_LINK_UNSPECIFIED) { - bss_conf = rcu_dereference(txi->control.vif->link_conf[link]); + bss_conf = rcu_dereference(vif->link_conf[link]); if (sta) link_sta = rcu_dereference(sta->link[link]); } else { @@ -2065,13 +2071,13 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw, return; } - if (txi->control.vif) - hwsim_check_magic(txi->control.vif); + if (vif) + hwsim_check_magic(vif); if (control->sta) hwsim_check_sta_magic(control->sta); if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) - ieee80211_get_tx_rates(txi->control.vif, control->sta, skb, + ieee80211_get_tx_rates(vif, control->sta, skb, txi->control.rates, ARRAY_SIZE(txi->control.rates)); @@ -6698,14 +6704,15 @@ static struct genl_family hwsim_genl_family __ro_after_init = { .n_mcgrps = ARRAY_SIZE(hwsim_mcgrps), }; -static void remove_user_radios(u32 portid) +static void remove_user_radios(u32 portid, int netgroup) { struct mac80211_hwsim_data *entry, *tmp; LIST_HEAD(list); spin_lock_bh(&hwsim_radio_lock); list_for_each_entry_safe(entry, tmp, &hwsim_radios, list) { - if (entry->destroy_on_close && entry->portid == portid) { + if (entry->destroy_on_close && entry->portid == portid && + entry->netgroup == netgroup) { list_move(&entry->list, &list); rhashtable_remove_fast(&hwsim_radios_rht, &entry->rht, hwsim_rht_params); @@ -6730,7 +6737,7 @@ static int mac80211_hwsim_netlink_notify(struct notifier_block *nb, if (state != NETLINK_URELEASE) return NOTIFY_DONE; - remove_user_radios(notify->portid); + remove_user_radios(notify->portid, hwsim_net_get_netgroup(notify->net)); if (notify->portid == hwsim_net_get_wmediumd(notify->net)) { printk(KERN_INFO "mac80211_hwsim: wmediumd released netlink" diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_usb.c b/drivers/net/wireless/zydas/zd1211rw/zd_usb.c index 2faa0de2a36e..8ee15a15f4ca 100644 --- a/drivers/net/wireless/zydas/zd1211rw/zd_usb.c +++ b/drivers/net/wireless/zydas/zd1211rw/zd_usb.c @@ -791,6 +791,7 @@ error: if (urbs) { for (i = 0; i < RX_URBS_COUNT; i++) free_rx_urb(urbs[i]); + kfree(urbs); } return r; } diff --git a/drivers/net/wwan/mhi_wwan_mbim.c b/drivers/net/wwan/mhi_wwan_mbim.c index c814fbd756a1..f8bc9a39bfa3 100644 --- a/drivers/net/wwan/mhi_wwan_mbim.c +++ b/drivers/net/wwan/mhi_wwan_mbim.c @@ -98,7 +98,7 @@ static struct mhi_mbim_link *mhi_mbim_get_link_rcu(struct mhi_mbim_context *mbim static int mhi_mbim_get_link_mux_id(struct mhi_controller *cntrl) { if (strcmp(cntrl->name, "foxconn-dw5934e") == 0 || - strcmp(cntrl->name, "foxconn-t99w515") == 0) + strcmp(cntrl->name, "foxconn-t99w640") == 0) return WDS_BIND_MUX_DATA_PORT_MUX_ID; return 0; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fa4181d7de73..f1f719351f3f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4901,7 +4901,6 @@ void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) */ nvme_stop_keep_alive(ctrl); blk_mq_destroy_queue(ctrl->admin_q); - blk_put_queue(ctrl->admin_q); if (ctrl->ops->flags & NVME_F_FABRICS) { blk_mq_destroy_queue(ctrl->fabrics_q); blk_put_queue(ctrl->fabrics_q); @@ -5045,6 +5044,8 @@ static void nvme_free_ctrl(struct device *dev) container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; + if (ctrl->admin_q) + blk_put_queue(ctrl->admin_q); if (!subsys || ctrl->instance != subsys->instance) ida_free(&nvme_instance_ida, ctrl->instance); nvme_free_cels(ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 03987f497a5b..2c903729b0b9 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2355,17 +2355,11 @@ nvme_fc_ctrl_free(struct kref *ref) container_of(ref, struct nvme_fc_ctrl, ref); unsigned long flags; - if (ctrl->ctrl.tagset) - nvme_remove_io_tag_set(&ctrl->ctrl); - /* remove from rport list */ spin_lock_irqsave(&ctrl->rport->lock, flags); list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); - nvme_unquiesce_admin_queue(&ctrl->ctrl); - nvme_remove_admin_tag_set(&ctrl->ctrl); - kfree(ctrl->queues); put_device(ctrl->dev); @@ -3259,13 +3253,20 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - cancel_work_sync(&ctrl->ioerr_work); cancel_delayed_work_sync(&ctrl->connect_work); + /* * kill the association on the link side. this will block * waiting for io to terminate */ nvme_fc_delete_association(ctrl); + cancel_work_sync(&ctrl->ioerr_work); + + if (ctrl->ctrl.tagset) + nvme_remove_io_tag_set(&ctrl->ctrl); + + nvme_unquiesce_admin_queue(&ctrl->ctrl); + nvme_remove_admin_tag_set(&ctrl->ctrl); } static void diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 543e17aead12..e35eccacee8c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -793,7 +793,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) return; } nvme_add_ns_head_cdev(head); - kblockd_schedule_work(&head->partition_scan_work); + queue_work(nvme_wq, &head->partition_scan_work); } nvme_mpath_add_sysfs_link(ns->head); diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index ceba21684e82..300d5e032f6d 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -298,7 +298,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, const char *hash_name; u8 *challenge = req->sq->dhchap_c1; struct nvme_dhchap_key *transformed_key; - u8 buf[4], sc_c = ctrl->concat ? 1 : 0; + u8 buf[4]; int ret; hash_name = nvme_auth_hmac_name(ctrl->shash_id); @@ -367,7 +367,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, ret = crypto_shash_update(shash, buf, 2); if (ret) goto out; - *buf = sc_c; + *buf = req->sq->sc_c; ret = crypto_shash_update(shash, buf, 1); if (ret) goto out; diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index bf01ec414c55..5946681cb0e3 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -43,6 +43,7 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) data->auth_protocol[0].dhchap.halen, data->auth_protocol[0].dhchap.dhlen); req->sq->dhchap_tid = le16_to_cpu(data->t_id); + req->sq->sc_c = data->sc_c; if (data->sc_c != NVME_AUTH_SECP_NOSC) { if (!IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS)) return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 51df72f5e89b..f3b09f4099f0 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -159,6 +159,7 @@ struct nvmet_sq { bool authenticated; struct delayed_work auth_expired_work; u16 dhchap_tid; + u8 sc_c; u8 dhchap_status; u8 dhchap_step; u8 *dhchap_c1; diff --git a/drivers/nvmem/layouts.c b/drivers/nvmem/layouts.c index f381ce1e84bd..7ebe53249035 100644 --- a/drivers/nvmem/layouts.c +++ b/drivers/nvmem/layouts.c @@ -51,7 +51,7 @@ static int nvmem_layout_bus_uevent(const struct device *dev, int ret; ret = of_device_uevent_modalias(dev, env); - if (ret != ENODEV) + if (ret != -ENODEV) return ret; return 0; diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 1cd93549d093..b174ec296489 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -479,6 +479,26 @@ out: } EXPORT_SYMBOL_GPL(of_irq_get); +const struct cpumask *of_irq_get_affinity(struct device_node *dev, int index) +{ + struct of_phandle_args oirq; + struct irq_fwspec_info info; + struct irq_fwspec fwspec; + int rc; + + rc = of_irq_parse_one(dev, index, &oirq); + if (rc) + return NULL; + + of_phandle_args_to_fwspec(oirq.np, oirq.args, oirq.args_count, + &fwspec); + + if (irq_populate_fwspec_info(&fwspec, &info)) + return NULL; + + return info.affinity; +} + /** * of_irq_get_byname - Decode a node's IRQ and return it as a Linux IRQ number * @dev: pointer to device tree node diff --git a/drivers/pci/controller/pcie-iproc.c b/drivers/pci/controller/pcie-iproc.c index 22134e95574b..ccf71993ea35 100644 --- a/drivers/pci/controller/pcie-iproc.c +++ b/drivers/pci/controller/pcie-iproc.c @@ -17,6 +17,7 @@ #include <linux/irqchip/arm-gic-v3.h> #include <linux/platform_device.h> #include <linux/of_address.h> +#include <linux/of_irq.h> #include <linux/of_pci.h> #include <linux/of_platform.h> #include <linux/phy/phy.h> @@ -1337,29 +1338,16 @@ static int iproc_pcie_msi_steer(struct iproc_pcie *pcie, static int iproc_pcie_msi_enable(struct iproc_pcie *pcie) { - struct device_node *msi_node; + struct device_node *msi_node = NULL; int ret; /* * Either the "msi-parent" or the "msi-map" phandle needs to exist * for us to obtain the MSI node. */ - - msi_node = of_parse_phandle(pcie->dev->of_node, "msi-parent", 0); - if (!msi_node) { - const __be32 *msi_map = NULL; - int len; - u32 phandle; - - msi_map = of_get_property(pcie->dev->of_node, "msi-map", &len); - if (!msi_map) - return -ENODEV; - - phandle = be32_to_cpup(msi_map + 1); - msi_node = of_find_node_by_phandle(phandle); - if (!msi_node) - return -ENODEV; - } + of_msi_xlate(pcie->dev, &msi_node, 0); + if (!msi_node) + return -ENODEV; /* * Certain revisions of the iProc PCIe controller require additional diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index ce741ed9dc3f..a329060287b5 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -49,96 +49,6 @@ static void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg * __pci_write_msi_msg(desc, msg); } -/** - * pci_msi_domain_calc_hwirq - Generate a unique ID for an MSI source - * @desc: Pointer to the MSI descriptor - * - * The ID number is only used within the irqdomain. - */ -static irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc) -{ - struct pci_dev *dev = msi_desc_to_pci_dev(desc); - - return (irq_hw_number_t)desc->msi_index | - pci_dev_id(dev) << 11 | - ((irq_hw_number_t)(pci_domain_nr(dev->bus) & 0xFFFFFFFF)) << 27; -} - -static void pci_msi_domain_set_desc(msi_alloc_info_t *arg, - struct msi_desc *desc) -{ - arg->desc = desc; - arg->hwirq = pci_msi_domain_calc_hwirq(desc); -} - -static struct msi_domain_ops pci_msi_domain_ops_default = { - .set_desc = pci_msi_domain_set_desc, -}; - -static void pci_msi_domain_update_dom_ops(struct msi_domain_info *info) -{ - struct msi_domain_ops *ops = info->ops; - - if (ops == NULL) { - info->ops = &pci_msi_domain_ops_default; - } else { - if (ops->set_desc == NULL) - ops->set_desc = pci_msi_domain_set_desc; - } -} - -static void pci_msi_domain_update_chip_ops(struct msi_domain_info *info) -{ - struct irq_chip *chip = info->chip; - - BUG_ON(!chip); - if (!chip->irq_write_msi_msg) - chip->irq_write_msi_msg = pci_msi_domain_write_msg; - if (!chip->irq_mask) - chip->irq_mask = pci_msi_mask_irq; - if (!chip->irq_unmask) - chip->irq_unmask = pci_msi_unmask_irq; -} - -/** - * pci_msi_create_irq_domain - Create a MSI interrupt domain - * @fwnode: Optional fwnode of the interrupt controller - * @info: MSI domain info - * @parent: Parent irq domain - * - * Updates the domain and chip ops and creates a MSI interrupt domain. - * - * Returns: - * A domain pointer or NULL in case of failure. - */ -struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, - struct msi_domain_info *info, - struct irq_domain *parent) -{ - if (WARN_ON(info->flags & MSI_FLAG_LEVEL_CAPABLE)) - info->flags &= ~MSI_FLAG_LEVEL_CAPABLE; - - if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) - pci_msi_domain_update_dom_ops(info); - if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) - pci_msi_domain_update_chip_ops(info); - - /* Let the core code free MSI descriptors when freeing interrupts */ - info->flags |= MSI_FLAG_FREE_MSI_DESCS; - - info->flags |= MSI_FLAG_ACTIVATE_EARLY | MSI_FLAG_DEV_SYSFS; - if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) - info->flags |= MSI_FLAG_MUST_REACTIVATE; - - /* PCI-MSI is oneshot-safe */ - info->chip->flags |= IRQCHIP_ONESHOT_SAFE; - /* Let the core update the bus token */ - info->bus_token = DOMAIN_BUS_PCI_MSI; - - return msi_create_irq_domain(fwnode, info, parent); -} -EXPORT_SYMBOL_GPL(pci_msi_create_irq_domain); - /* * Per device MSI[-X] domain functionality */ diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 4492b809094b..36f8c0985430 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -958,6 +958,7 @@ void pci_save_aspm_l1ss_state(struct pci_dev *dev); void pci_restore_aspm_l1ss_state(struct pci_dev *dev); #ifdef CONFIG_PCIEASPM +void pcie_aspm_remove_cap(struct pci_dev *pdev, u32 lnkcap); void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked); @@ -965,6 +966,7 @@ void pcie_aspm_powersave_config_link(struct pci_dev *pdev); void pci_configure_ltr(struct pci_dev *pdev); void pci_bridge_reconfigure_ltr(struct pci_dev *pdev); #else +static inline void pcie_aspm_remove_cap(struct pci_dev *pdev, u32 lnkcap) { } static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked) { } diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 79b965158473..cedea47a3547 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -814,7 +814,6 @@ static void pcie_aspm_override_default_link_state(struct pcie_link_state *link) static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) { struct pci_dev *child = link->downstream, *parent = link->pdev; - u32 parent_lnkcap, child_lnkcap; u16 parent_lnkctl, child_lnkctl; struct pci_bus *linkbus = parent->subordinate; @@ -829,9 +828,8 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * If ASPM not supported, don't mess with the clocks and link, * bail out now. */ - pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap); - pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap); - if (!(parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPMS)) + if (!(parent->aspm_l0s_support && child->aspm_l0s_support) && + !(parent->aspm_l1_support && child->aspm_l1_support)) return; /* Configure common clock before checking latencies */ @@ -843,8 +841,6 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * read-only Link Capabilities may change depending on common clock * configuration (PCIe r5.0, sec 7.5.3.6). */ - pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap); - pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap); pcie_capability_read_word(parent, PCI_EXP_LNKCTL, &parent_lnkctl); pcie_capability_read_word(child, PCI_EXP_LNKCTL, &child_lnkctl); @@ -864,7 +860,7 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * given link unless components on both sides of the link each * support L0s. */ - if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L0S) + if (parent->aspm_l0s_support && child->aspm_l0s_support) link->aspm_support |= PCIE_LINK_STATE_L0S; if (child_lnkctl & PCI_EXP_LNKCTL_ASPM_L0S) @@ -873,7 +869,7 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) link->aspm_enabled |= PCIE_LINK_STATE_L0S_DW; /* Setup L1 state */ - if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L1) + if (parent->aspm_l1_support && child->aspm_l1_support) link->aspm_support |= PCIE_LINK_STATE_L1; if (parent_lnkctl & child_lnkctl & PCI_EXP_LNKCTL_ASPM_L1) @@ -1530,6 +1526,19 @@ int pci_enable_link_state_locked(struct pci_dev *pdev, int state) } EXPORT_SYMBOL(pci_enable_link_state_locked); +void pcie_aspm_remove_cap(struct pci_dev *pdev, u32 lnkcap) +{ + if (lnkcap & PCI_EXP_LNKCAP_ASPM_L0S) + pdev->aspm_l0s_support = 0; + if (lnkcap & PCI_EXP_LNKCAP_ASPM_L1) + pdev->aspm_l1_support = 0; + + pci_info(pdev, "ASPM: Link Capabilities%s%s treated as unsupported to avoid device defect\n", + lnkcap & PCI_EXP_LNKCAP_ASPM_L0S ? " L0s" : "", + lnkcap & PCI_EXP_LNKCAP_ASPM_L1 ? " L1" : ""); + +} + static int pcie_aspm_set_policy(const char *val, const struct kernel_param *kp) { diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0ce98e18b5a8..9cd032dff31e 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1656,6 +1656,13 @@ void set_pcie_port_type(struct pci_dev *pdev) if (reg32 & PCI_EXP_LNKCAP_DLLLARC) pdev->link_active_reporting = 1; +#ifdef CONFIG_PCIEASPM + if (reg32 & PCI_EXP_LNKCAP_ASPM_L0S) + pdev->aspm_l0s_support = 1; + if (reg32 & PCI_EXP_LNKCAP_ASPM_L1) + pdev->aspm_l1_support = 1; +#endif + parent = pci_upstream_bridge(pdev); if (!parent) return; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 214ed060ca1b..b9c252aa6fe0 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2494,28 +2494,27 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, */ static void quirk_disable_aspm_l0s(struct pci_dev *dev) { - pci_info(dev, "Disabling L0s\n"); - pci_disable_link_state(dev, PCIE_LINK_STATE_L0S); -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10a7, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10a9, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10b6, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10c6, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10c7, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10c8, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10d6, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10db, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10dd, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10e1, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10ec, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10f1, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10f4, quirk_disable_aspm_l0s); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1508, quirk_disable_aspm_l0s); + pcie_aspm_remove_cap(dev, PCI_EXP_LNKCAP_ASPM_L0S); +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10a7, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10a9, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10b6, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10c6, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10c7, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10c8, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10d6, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10db, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10dd, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10e1, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10ec, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10f1, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10f4, quirk_disable_aspm_l0s); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1508, quirk_disable_aspm_l0s); static void quirk_disable_aspm_l0s_l1(struct pci_dev *dev) { - pci_info(dev, "Disabling ASPM L0s/L1\n"); - pci_disable_link_state(dev, PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1); + pcie_aspm_remove_cap(dev, + PCI_EXP_LNKCAP_ASPM_L0S | PCI_EXP_LNKCAP_ASPM_L1); } /* @@ -2523,7 +2522,10 @@ static void quirk_disable_aspm_l0s_l1(struct pci_dev *dev) * upstream PCIe root port when ASPM is enabled. At least L0s mode is affected; * disable both L0s and L1 for now to be safe. */ -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ASMEDIA, 0x1080, quirk_disable_aspm_l0s_l1); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ASMEDIA, 0x1080, quirk_disable_aspm_l0s_l1); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, 0x0451, quirk_disable_aspm_l0s_l1); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_PASEMI, 0xa002, quirk_disable_aspm_l0s_l1); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_HUAWEI, 0x1105, quirk_disable_aspm_l0s_l1); /* * Some Pericom PCIe-to-PCI bridges in reverse mode need the PCIe Retrain diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 5c310e803dd7..f7abd1333963 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -26,7 +26,8 @@ #include <asm/irq_regs.h> -static int armpmu_count_irq_users(const int irq); +static int armpmu_count_irq_users(const struct cpumask *affinity, + const int irq); struct pmu_irq_ops { void (*enable_pmuirq)(unsigned int irq); @@ -64,7 +65,9 @@ static void armpmu_enable_percpu_pmuirq(unsigned int irq) static void armpmu_free_percpu_pmuirq(unsigned int irq, int cpu, void __percpu *devid) { - if (armpmu_count_irq_users(irq) == 1) + struct arm_pmu *armpmu = *per_cpu_ptr((void * __percpu *)devid, cpu); + + if (armpmu_count_irq_users(&armpmu->supported_cpus, irq) == 1) free_percpu_irq(irq, devid); } @@ -89,7 +92,9 @@ static void armpmu_disable_percpu_pmunmi(unsigned int irq) static void armpmu_free_percpu_pmunmi(unsigned int irq, int cpu, void __percpu *devid) { - if (armpmu_count_irq_users(irq) == 1) + struct arm_pmu *armpmu = *per_cpu_ptr((void * __percpu *)devid, cpu); + + if (armpmu_count_irq_users(&armpmu->supported_cpus, irq) == 1) free_percpu_nmi(irq, devid); } @@ -99,7 +104,6 @@ static const struct pmu_irq_ops percpu_pmunmi_ops = { .free_pmuirq = armpmu_free_percpu_pmunmi }; -DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu); static DEFINE_PER_CPU(int, cpu_irq); static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops); @@ -580,11 +584,11 @@ static const struct attribute_group armpmu_common_attr_group = { .attrs = armpmu_common_attrs, }; -static int armpmu_count_irq_users(const int irq) +static int armpmu_count_irq_users(const struct cpumask *affinity, const int irq) { int cpu, count = 0; - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, affinity) { if (per_cpu(cpu_irq, cpu) == irq) count++; } @@ -592,12 +596,13 @@ static int armpmu_count_irq_users(const int irq) return count; } -static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq) +static const struct pmu_irq_ops * +armpmu_find_irq_ops(const struct cpumask *affinity, int irq) { const struct pmu_irq_ops *ops = NULL; int cpu; - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, affinity) { if (per_cpu(cpu_irq, cpu) != irq) continue; @@ -609,22 +614,25 @@ static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq) return ops; } -void armpmu_free_irq(int irq, int cpu) +void armpmu_free_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu) { if (per_cpu(cpu_irq, cpu) == 0) return; if (WARN_ON(irq != per_cpu(cpu_irq, cpu))) return; - per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, &cpu_armpmu); + per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, armpmu); per_cpu(cpu_irq, cpu) = 0; per_cpu(cpu_irq_ops, cpu) = NULL; } -int armpmu_request_irq(int irq, int cpu) +int armpmu_request_irq(struct arm_pmu * __percpu *pcpu_armpmu, int irq, int cpu) { int err = 0; + struct arm_pmu **armpmu = per_cpu_ptr(pcpu_armpmu, cpu); + const struct cpumask *affinity = *armpmu ? &(*armpmu)->supported_cpus : + cpu_possible_mask; /* ACPI */ const irq_handler_t handler = armpmu_dispatch_irq; const struct pmu_irq_ops *irq_ops; @@ -646,25 +654,24 @@ int armpmu_request_irq(int irq, int cpu) IRQF_NOBALANCING | IRQF_NO_AUTOEN | IRQF_NO_THREAD; - err = request_nmi(irq, handler, irq_flags, "arm-pmu", - per_cpu_ptr(&cpu_armpmu, cpu)); + err = request_nmi(irq, handler, irq_flags, "arm-pmu", armpmu); /* If cannot get an NMI, get a normal interrupt */ if (err) { err = request_irq(irq, handler, irq_flags, "arm-pmu", - per_cpu_ptr(&cpu_armpmu, cpu)); + armpmu); irq_ops = &pmuirq_ops; } else { has_nmi = true; irq_ops = &pmunmi_ops; } - } else if (armpmu_count_irq_users(irq) == 0) { - err = request_percpu_nmi(irq, handler, "arm-pmu", &cpu_armpmu); + } else if (armpmu_count_irq_users(affinity, irq) == 0) { + err = request_percpu_nmi(irq, handler, "arm-pmu", affinity, pcpu_armpmu); /* If cannot get an NMI, get a normal interrupt */ if (err) { - err = request_percpu_irq(irq, handler, "arm-pmu", - &cpu_armpmu); + err = request_percpu_irq_affinity(irq, handler, "arm-pmu", + affinity, pcpu_armpmu); irq_ops = &percpu_pmuirq_ops; } else { has_nmi = true; @@ -672,7 +679,7 @@ int armpmu_request_irq(int irq, int cpu) } } else { /* Per cpudevid irq was already requested by another CPU */ - irq_ops = armpmu_find_irq_ops(irq); + irq_ops = armpmu_find_irq_ops(affinity, irq); if (WARN_ON(!irq_ops)) err = -EINVAL; @@ -717,8 +724,6 @@ static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node) if (pmu->reset) pmu->reset(pmu); - per_cpu(cpu_armpmu, cpu) = pmu; - irq = armpmu_get_cpu_irq(pmu, cpu); if (irq) per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq); @@ -738,8 +743,6 @@ static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node) if (irq) per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq); - per_cpu(cpu_armpmu, cpu) = NULL; - return 0; } diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c index 05dda19c5359..e80f76d95e68 100644 --- a/drivers/perf/arm_pmu_acpi.c +++ b/drivers/perf/arm_pmu_acpi.c @@ -218,7 +218,7 @@ static int arm_pmu_acpi_parse_irqs(void) * them with their PMUs. */ per_cpu(pmu_irqs, cpu) = irq; - err = armpmu_request_irq(irq, cpu); + err = armpmu_request_irq(&probed_pmus, irq, cpu); if (err) goto out_err; } diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c index 118170a5cede..1c9e50a13201 100644 --- a/drivers/perf/arm_pmu_platform.c +++ b/drivers/perf/arm_pmu_platform.c @@ -42,14 +42,13 @@ static int probe_current_pmu(struct arm_pmu *pmu, return ret; } -static int pmu_parse_percpu_irq(struct arm_pmu *pmu, int irq) +static int pmu_parse_percpu_irq(struct arm_pmu *pmu, int irq, + const struct cpumask *affinity) { - int cpu, ret; struct pmu_hw_events __percpu *hw_events = pmu->hw_events; + int cpu; - ret = irq_get_percpu_devid_partition(irq, &pmu->supported_cpus); - if (ret) - return ret; + cpumask_copy(&pmu->supported_cpus, affinity); for_each_cpu(cpu, &pmu->supported_cpus) per_cpu(hw_events->irq, cpu) = irq; @@ -115,9 +114,12 @@ static int pmu_parse_irqs(struct arm_pmu *pmu) } if (num_irqs == 1) { - int irq = platform_get_irq(pdev, 0); + const struct cpumask *affinity; + int irq; + + irq = platform_get_irq_affinity(pdev, 0, &affinity); if ((irq > 0) && irq_is_percpu_devid(irq)) - return pmu_parse_percpu_irq(pmu, irq); + return pmu_parse_percpu_irq(pmu, irq, affinity); } if (nr_cpu_ids != 1 && !pmu_has_irq_affinity(dev->of_node)) @@ -163,7 +165,7 @@ static int armpmu_request_irqs(struct arm_pmu *armpmu) if (!irq) continue; - err = armpmu_request_irq(irq, cpu); + err = armpmu_request_irq(&hw_events->percpu_pmu, irq, cpu); if (err) break; } @@ -179,7 +181,7 @@ static void armpmu_free_irqs(struct arm_pmu *armpmu) for_each_cpu(cpu, &armpmu->supported_cpus) { int irq = per_cpu(hw_events->irq, cpu); - armpmu_free_irq(irq, cpu); + armpmu_free_irq(&hw_events->percpu_pmu, irq, cpu); } } diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 69c5cc8f5606..ca8d706d4022 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -1064,7 +1064,7 @@ static int armv8pmu_user_event_idx(struct perf_event *event) static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, struct task_struct *task, bool sched_in) { - struct arm_pmu *armpmu = *this_cpu_ptr(&cpu_armpmu); + struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); if (!hw_events->branch_users) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index fa50645fedda..87908f0712c0 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -1259,8 +1259,8 @@ static int arm_spe_pmu_dev_init(struct arm_spe_pmu *spe_pmu) return -ENXIO; /* Request our PPIs (note that the IRQ is still disabled) */ - ret = request_percpu_irq(spe_pmu->irq, arm_spe_pmu_irq_handler, DRVNAME, - spe_pmu->handle); + ret = request_percpu_irq_affinity(spe_pmu->irq, arm_spe_pmu_irq_handler, + DRVNAME, mask, spe_pmu->handle); if (ret) return ret; @@ -1287,8 +1287,10 @@ static void arm_spe_pmu_dev_teardown(struct arm_spe_pmu *spe_pmu) static int arm_spe_pmu_irq_probe(struct arm_spe_pmu *spe_pmu) { struct platform_device *pdev = spe_pmu->pdev; - int irq = platform_get_irq(pdev, 0); + const struct cpumask *affinity; + int irq; + irq = platform_get_irq_affinity(pdev, 0, &affinity); if (irq < 0) return -ENXIO; @@ -1297,10 +1299,7 @@ static int arm_spe_pmu_irq_probe(struct arm_spe_pmu *spe_pmu) return -EINVAL; } - if (irq_get_percpu_devid_partition(irq, &spe_pmu->supported_cpus)) { - dev_err(&pdev->dev, "failed to get PPI partition (%d)\n", irq); - return -EINVAL; - } + cpumask_copy(&spe_pmu->supported_cpus, affinity); spe_pmu->irq = irq; return 0; diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index e255c1b069ec..7dd282da67ce 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -1109,7 +1109,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) /* compute hardware counter index */ hidx = info->csr - CSR_CYCLE; - /* check if the corresponding bit is set in sscountovf or overflow mask in shmem */ + /* check if the corresponding bit is set in scountovf or overflow mask in shmem */ if (!(overflow & BIT(hidx))) continue; diff --git a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c index 68abb6d6cecd..a8f82104a384 100644 --- a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c +++ b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c @@ -532,6 +532,11 @@ static int cs42l43_gpio_add_pin_ranges(struct gpio_chip *chip) return ret; } +static void cs42l43_fwnode_put(void *data) +{ + fwnode_handle_put(data); +} + static int cs42l43_pin_probe(struct platform_device *pdev) { struct cs42l43 *cs42l43 = dev_get_drvdata(pdev->dev.parent); @@ -563,10 +568,20 @@ static int cs42l43_pin_probe(struct platform_device *pdev) priv->gpio_chip.ngpio = CS42L43_NUM_GPIOS; if (is_of_node(fwnode)) { - fwnode = fwnode_get_named_child_node(fwnode, "pinctrl"); - - if (fwnode && !fwnode->dev) - fwnode->dev = priv->dev; + struct fwnode_handle *child; + + child = fwnode_get_named_child_node(fwnode, "pinctrl"); + if (child) { + ret = devm_add_action_or_reset(&pdev->dev, + cs42l43_fwnode_put, child); + if (ret) { + fwnode_handle_put(child); + return ret; + } + if (!child->dev) + child->dev = priv->dev; + fwnode = child; + } } priv->gpio_chip.fwnode = fwnode; diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8189.c b/drivers/pinctrl/mediatek/pinctrl-mt8189.c index 7028aff55ae5..f6a3e584588b 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt8189.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt8189.c @@ -1642,9 +1642,7 @@ static const struct mtk_pin_reg_calc mt8189_reg_cals[PINCTRL_PIN_REG_MAX] = { }; static const char * const mt8189_pinctrl_register_base_names[] = { - "gpio_base", "iocfg_bm0_base", "iocfg_bm1_base", "iocfg_bm2_base", "iocfg_lm_base", - "iocfg_lt0_base", "iocfg_lt1_base", "iocfg_rb0_base", "iocfg_rb1_base", - "iocfg_rt_base" + "base", "lm", "rb0", "rb1", "bm0", "bm1", "bm2", "lt0", "lt1", "rt", }; static const struct mtk_eint_hw mt8189_eint_hw = { diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8196.c b/drivers/pinctrl/mediatek/pinctrl-mt8196.c index 82a73929c7a0..dec957c1724b 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt8196.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt8196.c @@ -1801,10 +1801,8 @@ static const struct mtk_pin_reg_calc mt8196_reg_cals[PINCTRL_PIN_REG_MAX] = { }; static const char * const mt8196_pinctrl_register_base_names[] = { - "iocfg0", "iocfg_rt", "iocfg_rm1", "iocfg_rm2", - "iocfg_rb", "iocfg_bm1", "iocfg_bm2", "iocfg_bm3", - "iocfg_lt", "iocfg_lm1", "iocfg_lm2", "iocfg_lb1", - "iocfg_lb2", "iocfg_tm1", "iocfg_tm2", "iocfg_tm3", + "base", "rt", "rm1", "rm2", "rb", "bm1", "bm2", "bm3", + "lt", "lm1", "lm2", "lb1", "lb2", "tm1", "tm2", "tm3", }; static const struct mtk_eint_hw mt8196_eint_hw = { diff --git a/drivers/pinctrl/nxp/pinctrl-s32cc.c b/drivers/pinctrl/nxp/pinctrl-s32cc.c index 501eb296c760..35511f83d056 100644 --- a/drivers/pinctrl/nxp/pinctrl-s32cc.c +++ b/drivers/pinctrl/nxp/pinctrl-s32cc.c @@ -392,6 +392,7 @@ static int s32_pmx_gpio_request_enable(struct pinctrl_dev *pctldev, gpio_pin->pin_id = offset; gpio_pin->config = config; + INIT_LIST_HEAD(&gpio_pin->list); spin_lock_irqsave(&ipctl->gpio_configs_lock, flags); list_add(&gpio_pin->list, &ipctl->gpio_configs); @@ -951,7 +952,7 @@ int s32_pinctrl_probe(struct platform_device *pdev, spin_lock_init(&ipctl->gpio_configs_lock); s32_pinctrl_desc = - devm_kmalloc(&pdev->dev, sizeof(*s32_pinctrl_desc), GFP_KERNEL); + devm_kzalloc(&pdev->dev, sizeof(*s32_pinctrl_desc), GFP_KERNEL); if (!s32_pinctrl_desc) return -ENOMEM; diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 67525d542c5b..e99871b90ab9 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -189,7 +189,7 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev, */ if (d && i != gpio_func && !test_and_set_bit(d->hwirq, pctrl->disabled_for_mux)) - disable_irq(irq); + disable_irq_nosync(irq); raw_spin_lock_irqsave(&pctrl->lock, flags); diff --git a/drivers/pinctrl/realtek/Kconfig b/drivers/pinctrl/realtek/Kconfig index 0fc6bd4fcb7e..400c9e5b16ad 100644 --- a/drivers/pinctrl/realtek/Kconfig +++ b/drivers/pinctrl/realtek/Kconfig @@ -6,6 +6,7 @@ config PINCTRL_RTD default y select PINMUX select GENERIC_PINCONF + select REGMAP_MMIO config PINCTRL_RTD1619B tristate "Realtek DHC 1619B pin controller driver" diff --git a/drivers/platform/arm64/lenovo-thinkpad-t14s.c b/drivers/platform/arm64/lenovo-thinkpad-t14s.c index 1d5d11adaf32..cf6a1d3b2617 100644 --- a/drivers/platform/arm64/lenovo-thinkpad-t14s.c +++ b/drivers/platform/arm64/lenovo-thinkpad-t14s.c @@ -120,6 +120,7 @@ static int t14s_ec_write(void *context, unsigned int reg, if (ret < 0) return ret; + fsleep(10000); return 0; } @@ -157,6 +158,7 @@ static int t14s_ec_read(void *context, unsigned int reg, out: i2c_unlock_bus(client->adapter, I2C_LOCK_SEGMENT); + fsleep(10000); return ret; } @@ -191,6 +193,8 @@ static int t14s_ec_read_evt(struct t14s_ec *ec, u8 *val) if (ret < 0) goto out; + fsleep(10000); + ret = 0; out: @@ -557,12 +561,6 @@ static int t14s_ec_probe(struct i2c_client *client) return dev_err_probe(dev, PTR_ERR(ec->regmap), "Failed to init regmap\n"); - ret = devm_request_threaded_irq(dev, client->irq, NULL, - t14s_ec_irq_handler, - IRQF_ONESHOT, dev_name(dev), ec); - if (ret < 0) - return dev_err_probe(dev, ret, "Failed to get IRQ\n"); - ret = t14s_leds_probe(ec); if (ret < 0) return ret; @@ -579,6 +577,12 @@ static int t14s_ec_probe(struct i2c_client *client) if (ret < 0) return ret; + ret = devm_request_threaded_irq(dev, client->irq, NULL, + t14s_ec_irq_handler, + IRQF_ONESHOT, dev_name(dev), ec); + if (ret < 0) + return dev_err_probe(dev, ret, "Failed to get IRQ\n"); + /* * Disable wakeup support by default, because the driver currently does * not support masking any events and the laptop should not wake up when diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 46e62feeda3c..c883a28e0916 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -432,7 +432,7 @@ config WIRELESS_HOTKEY depends on INPUT help This driver provides supports for the wireless buttons found on some AMD, - HP, & Xioami laptops. + HP, & Xiaomi laptops. On such systems the driver should load automatically (via ACPI alias). To compile this driver as a module, choose M here: the module will @@ -545,6 +545,7 @@ config MSI_WMI config MSI_WMI_PLATFORM tristate "MSI WMI Platform features" depends on ACPI_WMI + depends on DMI depends on HWMON help Say Y here if you want to have support for WMI-based platform features diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index 13eb22b35aa8..d848afc91f87 100644 --- a/drivers/platform/x86/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c @@ -102,6 +102,7 @@ MODULE_ALIAS("wmi:676AA15E-6A47-4D9F-A2CC-1E6D18D14026"); enum acer_wmi_event_ids { WMID_HOTKEY_EVENT = 0x1, + WMID_BACKLIGHT_EVENT = 0x4, WMID_ACCEL_OR_KBD_DOCK_EVENT = 0x5, WMID_GAMING_TURBO_KEY_EVENT = 0x7, WMID_AC_EVENT = 0x8, @@ -2369,6 +2370,9 @@ static void acer_wmi_notify(union acpi_object *obj, void *context) sparse_keymap_report_event(acer_wmi_input_dev, scancode, 1, true); } break; + case WMID_BACKLIGHT_EVENT: + /* Already handled by acpi-video */ + break; case WMID_ACCEL_OR_KBD_DOCK_EVENT: acer_gsensor_event(); acer_kbd_dock_event(&return_value); diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index d63aaad7ef59..404e62ad293a 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -122,6 +122,14 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_PRODUCT_NAME, "21A1"), } }, + { + .ident = "ROG Xbox Ally RC73YA", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_BOARD_NAME, "RC73YA"), + } + }, /* https://bugzilla.kernel.org/show_bug.cgi?id=218024 */ { .ident = "V14 G4 AMN", @@ -204,6 +212,23 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_PRODUCT_NAME, "82ND"), } }, + /* https://gitlab.freedesktop.org/drm/amd/-/issues/4618 */ + { + .ident = "Lenovo Legion Go 2", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "83N0"), + } + }, + { + .ident = "Lenovo Legion Go 2", + .driver_data = &quirk_s2idle_bug, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "83N1"), + } + }, /* https://gitlab.freedesktop.org/drm/amd/-/issues/2684 */ { .ident = "HP Laptop 15s-eq2xxx", diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c index bd318fd02ccf..cae3fcafd4d7 100644 --- a/drivers/platform/x86/amd/pmc/pmc.c +++ b/drivers/platform/x86/amd/pmc/pmc.c @@ -106,6 +106,7 @@ static void amd_pmc_get_ip_info(struct amd_pmc_dev *dev) switch (dev->cpu_id) { case AMD_CPU_ID_PCO: case AMD_CPU_ID_RN: + case AMD_CPU_ID_VG: case AMD_CPU_ID_YC: case AMD_CPU_ID_CB: dev->num_ips = 12; @@ -517,6 +518,7 @@ static int amd_pmc_get_os_hint(struct amd_pmc_dev *dev) case AMD_CPU_ID_PCO: return MSG_OS_HINT_PCO; case AMD_CPU_ID_RN: + case AMD_CPU_ID_VG: case AMD_CPU_ID_YC: case AMD_CPU_ID_CB: case AMD_CPU_ID_PS: @@ -717,6 +719,7 @@ static const struct pci_device_id pmc_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_RV) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_SP) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_SHP) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_VG) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_ROOT) }, { } diff --git a/drivers/platform/x86/amd/pmc/pmc.h b/drivers/platform/x86/amd/pmc/pmc.h index 62f3e51020fd..fe3f53eb5955 100644 --- a/drivers/platform/x86/amd/pmc/pmc.h +++ b/drivers/platform/x86/amd/pmc/pmc.h @@ -156,6 +156,7 @@ void amd_mp2_stb_deinit(struct amd_pmc_dev *dev); #define AMD_CPU_ID_RN 0x1630 #define AMD_CPU_ID_PCO AMD_CPU_ID_RV #define AMD_CPU_ID_CZN AMD_CPU_ID_RN +#define AMD_CPU_ID_VG 0x1645 #define AMD_CPU_ID_YC 0x14B5 #define AMD_CPU_ID_CB 0x14D8 #define AMD_CPU_ID_PS 0x14E8 diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c index f417dcc9af35..fadf7aac6779 100644 --- a/drivers/platform/x86/dell/alienware-wmi-wmax.c +++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c @@ -90,34 +90,34 @@ static struct awcc_quirks empty_quirks; static const struct dmi_system_id awcc_dmi_table[] __initconst = { { - .ident = "Alienware Area-51m", + .ident = "Alienware 16 Aurora", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware Area-51m"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware 16 Aurora"), }, - .driver_data = &generic_quirks, + .driver_data = &g_series_quirks, }, { - .ident = "Alienware Area-51m R2", + .ident = "Alienware Area-51m", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware Area-51m R2"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware Area-51m"), }, .driver_data = &generic_quirks, }, { - .ident = "Alienware m15 R5", + .ident = "Alienware m15", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m15 R5"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m15"), }, .driver_data = &generic_quirks, }, { - .ident = "Alienware m15 R7", + .ident = "Alienware m16 R1 AMD", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m15 R7"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m16 R1 AMD"), }, .driver_data = &generic_quirks, }, @@ -130,14 +130,6 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { .driver_data = &g_series_quirks, }, { - .ident = "Alienware m16 R1 AMD", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m16 R1 AMD"), - }, - .driver_data = &generic_quirks, - }, - { .ident = "Alienware m16 R2", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), @@ -146,114 +138,66 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { .driver_data = &generic_quirks, }, { - .ident = "Alienware m17 R5", + .ident = "Alienware m17", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m17 R5 AMD"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m17"), }, .driver_data = &generic_quirks, }, { - .ident = "Alienware m18 R2", + .ident = "Alienware m18", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m18 R2"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m18"), }, .driver_data = &generic_quirks, }, { - .ident = "Alienware x15 R1", + .ident = "Alienware x15", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x15 R1"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x15"), }, .driver_data = &generic_quirks, }, { - .ident = "Alienware x15 R2", + .ident = "Alienware x17", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x15 R2"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x17"), }, .driver_data = &generic_quirks, }, { - .ident = "Alienware x17 R2", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), - DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x17 R2"), - }, - .driver_data = &generic_quirks, - }, - { - .ident = "Dell Inc. G15 5510", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5510"), - }, - .driver_data = &g_series_quirks, - }, - { - .ident = "Dell Inc. G15 5511", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5511"), - }, - .driver_data = &g_series_quirks, - }, - { - .ident = "Dell Inc. G15 5515", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5515"), - }, - .driver_data = &g_series_quirks, - }, - { - .ident = "Dell Inc. G15 5530", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5530"), - }, - .driver_data = &g_series_quirks, - }, - { - .ident = "Dell Inc. G16 7630", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "Dell G16 7630"), - }, - .driver_data = &g_series_quirks, - }, - { - .ident = "Dell Inc. G3 3500", + .ident = "Dell Inc. G15", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "G3 3500"), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15"), }, .driver_data = &g_series_quirks, }, { - .ident = "Dell Inc. G3 3590", + .ident = "Dell Inc. G16", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "G3 3590"), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell G16"), }, .driver_data = &g_series_quirks, }, { - .ident = "Dell Inc. G5 5500", + .ident = "Dell Inc. G3", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "G5 5500"), + DMI_MATCH(DMI_PRODUCT_NAME, "G3"), }, .driver_data = &g_series_quirks, }, { - .ident = "Dell Inc. G5 5505", + .ident = "Dell Inc. G5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "G5 5505"), + DMI_MATCH(DMI_PRODUCT_NAME, "G5"), }, .driver_data = &g_series_quirks, }, diff --git a/drivers/platform/x86/dell/dell-wmi-base.c b/drivers/platform/x86/dell/dell-wmi-base.c index 841a5414d28a..28076929d6af 100644 --- a/drivers/platform/x86/dell/dell-wmi-base.c +++ b/drivers/platform/x86/dell/dell-wmi-base.c @@ -365,6 +365,13 @@ static const struct key_entry dell_wmi_keymap_type_0012[] = { /* Backlight brightness change event */ { KE_IGNORE, 0x0003, { KEY_RESERVED } }, + /* + * Electronic privacy screen toggled, extended data gives state, + * separate entries for on/off see handling in dell_wmi_process_key(). + */ + { KE_KEY, 0x000c, { KEY_EPRIVACY_SCREEN_OFF } }, + { KE_KEY, 0x000c, { KEY_EPRIVACY_SCREEN_ON } }, + /* Ultra-performance mode switch request */ { KE_IGNORE, 0x000d, { KEY_RESERVED } }, @@ -435,6 +442,11 @@ static int dell_wmi_process_key(struct wmi_device *wdev, int type, int code, u16 "Dell tablet mode switch", SW_TABLET_MODE, !buffer[0]); return 1; + } else if (type == 0x0012 && code == 0x000c && remaining > 0) { + /* Eprivacy toggle, switch to "on" key entry for on events */ + if (buffer[0] == 2) + key++; + used = 1; } else if (type == 0x0012 && code == 0x000d && remaining > 0) { value = (buffer[2] == 2); used = 1; diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 8b3533d6ba09..ad9d9f97960f 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -92,9 +92,11 @@ static const char * const victus_thermal_profile_boards[] = { "8A25" }; -/* DMI Board names of Victus 16-r1000 and Victus 16-s1000 laptops */ +/* DMI Board names of Victus 16-r and Victus 16-s laptops */ static const char * const victus_s_thermal_profile_boards[] = { - "8C99", "8C9C" + "8BBE", "8BD4", "8BD5", + "8C78", "8C99", "8C9C", + "8D41", }; enum hp_wmi_radio { diff --git a/drivers/platform/x86/huawei-wmi.c b/drivers/platform/x86/huawei-wmi.c index c3772df34679..8a4c54089ace 100644 --- a/drivers/platform/x86/huawei-wmi.c +++ b/drivers/platform/x86/huawei-wmi.c @@ -81,6 +81,10 @@ static const struct key_entry huawei_wmi_keymap[] = { { KE_KEY, 0x289, { KEY_WLAN } }, // Huawei |M| key { KE_KEY, 0x28a, { KEY_CONFIG } }, + // HONOR YOYO key + { KE_KEY, 0x28b, { KEY_NOTIFICATION_CENTER } }, + // HONOR print screen + { KE_KEY, 0x28e, { KEY_PRINT } }, // Keyboard backlit { KE_IGNORE, 0x293, { KEY_KBDILLUMTOGGLE } }, { KE_IGNORE, 0x294, { KEY_KBDILLUMUP } }, diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index f25a427cccda..9c07a7faf18f 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -55,6 +55,7 @@ static const struct acpi_device_id intel_hid_ids[] = { { "INTC10CB" }, { "INTC10CC" }, { "INTC10F1" }, + { "INTC10F2" }, { } }; MODULE_DEVICE_TABLE(acpi, intel_hid_ids); diff --git a/drivers/platform/x86/intel/int3472/clk_and_regulator.c b/drivers/platform/x86/intel/int3472/clk_and_regulator.c index 476ec24d3702..9e052b164a1a 100644 --- a/drivers/platform/x86/intel/int3472/clk_and_regulator.c +++ b/drivers/platform/x86/intel/int3472/clk_and_regulator.c @@ -245,15 +245,12 @@ int skl_int3472_register_regulator(struct int3472_discrete_device *int3472, if (IS_ERR(regulator->rdev)) return PTR_ERR(regulator->rdev); - int3472->regulators[int3472->n_regulator_gpios].ena_gpio = gpio; int3472->n_regulator_gpios++; return 0; } void skl_int3472_unregister_regulator(struct int3472_discrete_device *int3472) { - for (int i = 0; i < int3472->n_regulator_gpios; i++) { + for (int i = 0; i < int3472->n_regulator_gpios; i++) regulator_unregister(int3472->regulators[i].rdev); - gpiod_put(int3472->regulators[i].ena_gpio); - } } diff --git a/drivers/platform/x86/intel/int3472/led.c b/drivers/platform/x86/intel/int3472/led.c index f1d6d7b0cb75..b1d84b968112 100644 --- a/drivers/platform/x86/intel/int3472/led.c +++ b/drivers/platform/x86/intel/int3472/led.c @@ -43,7 +43,7 @@ int skl_int3472_register_pled(struct int3472_discrete_device *int3472, struct gp int3472->pled.lookup.provider = int3472->pled.name; int3472->pled.lookup.dev_id = int3472->sensor_name; - int3472->pled.lookup.con_id = "privacy-led"; + int3472->pled.lookup.con_id = "privacy"; led_add_lookup(&int3472->pled.lookup); return 0; diff --git a/drivers/platform/x86/intel/punit_ipc.c b/drivers/platform/x86/intel/punit_ipc.c index bafac8aa2baf..14513010daad 100644 --- a/drivers/platform/x86/intel/punit_ipc.c +++ b/drivers/platform/x86/intel/punit_ipc.c @@ -250,7 +250,7 @@ static int intel_punit_ipc_probe(struct platform_device *pdev) } else { ret = devm_request_irq(&pdev->dev, irq, intel_punit_ioc, IRQF_NO_SUSPEND, "intel_punit_ipc", - &punit_ipcdev); + punit_ipcdev); if (ret) { dev_err(&pdev->dev, "Failed to request irq: %d\n", irq); return ret; diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c b/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c index 3f4343147dad..950ede5eab76 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c @@ -108,11 +108,11 @@ static int isst_if_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ret = pci_read_config_dword(pdev, 0xD0, &mmio_base); if (ret) - return ret; + return pcibios_err_to_errno(ret); ret = pci_read_config_dword(pdev, 0xFC, &pcu_base); if (ret) - return ret; + return pcibios_err_to_errno(ret); pcu_base &= GENMASK(10, 0); base_addr = (u64)mmio_base << 23 | (u64) pcu_base << 12; diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h index 70ae11519837..0abe850ef54e 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h @@ -40,7 +40,7 @@ * @agent_type_mask: Bit mask of all hardware agents for this domain * @uncore_attr_group: Attribute group storage * @max_freq_khz_kobj_attr: Storage for kobject attribute max_freq_khz - * @mix_freq_khz_kobj_attr: Storage for kobject attribute min_freq_khz + * @min_freq_khz_kobj_attr: Storage for kobject attribute min_freq_khz * @initial_max_freq_khz_kobj_attr: Storage for kobject attribute initial_max_freq_khz * @initial_min_freq_khz_kobj_attr: Storage for kobject attribute initial_min_freq_khz * @current_freq_khz_kobj_attr: Storage for kobject attribute current_freq_khz @@ -48,13 +48,14 @@ * @fabric_cluster_id_kobj_attr: Storage for kobject attribute fabric_cluster_id * @package_id_kobj_attr: Storage for kobject attribute package_id * @elc_low_threshold_percent_kobj_attr: - Storage for kobject attribute elc_low_threshold_percent + * Storage for kobject attribute elc_low_threshold_percent * @elc_high_threshold_percent_kobj_attr: - Storage for kobject attribute elc_high_threshold_percent + * Storage for kobject attribute elc_high_threshold_percent * @elc_high_threshold_enable_kobj_attr: - Storage for kobject attribute elc_high_threshold_enable + * Storage for kobject attribute elc_high_threshold_enable * @elc_floor_freq_khz_kobj_attr: Storage for kobject attribute elc_floor_freq_khz * @agent_types_kobj_attr: Storage for kobject attribute agent_type + * @die_id_kobj_attr: Attribute storage for die_id information * @uncore_attrs: Attribute storage for group creation * * This structure is used to encapsulate all data related to uncore sysfs diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c index 2a6897035150..0dfc552b2802 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c @@ -256,6 +256,10 @@ static const struct x86_cpu_id intel_uncore_cpu_ids[] = { X86_MATCH_VFM(INTEL_ARROWLAKE, NULL), X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), X86_MATCH_VFM(INTEL_LUNARLAKE_M, NULL), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), + X86_MATCH_VFM(INTEL_NOVALAKE, NULL), + X86_MATCH_VFM(INTEL_NOVALAKE_L, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_cpu_ids); diff --git a/drivers/platform/x86/msi-wmi-platform.c b/drivers/platform/x86/msi-wmi-platform.c index dc5e9878cb68..e912fcc12d12 100644 --- a/drivers/platform/x86/msi-wmi-platform.c +++ b/drivers/platform/x86/msi-wmi-platform.c @@ -14,6 +14,7 @@ #include <linux/debugfs.h> #include <linux/device.h> #include <linux/device/driver.h> +#include <linux/dmi.h> #include <linux/errno.h> #include <linux/hwmon.h> #include <linux/kernel.h> @@ -28,7 +29,7 @@ #define DRIVER_NAME "msi-wmi-platform" -#define MSI_PLATFORM_GUID "ABBC0F6E-8EA1-11d1-00A0-C90629100000" +#define MSI_PLATFORM_GUID "ABBC0F6E-8EA1-11D1-00A0-C90629100000" #define MSI_WMI_PLATFORM_INTERFACE_VERSION 2 @@ -448,7 +449,45 @@ static struct wmi_driver msi_wmi_platform_driver = { .probe = msi_wmi_platform_probe, .no_singleton = true, }; -module_wmi_driver(msi_wmi_platform_driver); + +/* + * MSI reused the WMI GUID from the WMI-ACPI sample code provided by Microsoft, + * so other manufacturers might use it as well for their WMI-ACPI implementations. + */ +static const struct dmi_system_id msi_wmi_platform_whitelist[] __initconst = { + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "MICRO-STAR INT"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Micro-Star International"), + }, + }, + { } +}; + +static int __init msi_wmi_platform_module_init(void) +{ + if (!dmi_check_system(msi_wmi_platform_whitelist)) { + if (!force) + return -ENODEV; + + pr_warn("Ignoring DMI whitelist\n"); + } + + return wmi_driver_register(&msi_wmi_platform_driver); +} + +static void __exit msi_wmi_platform_module_exit(void) +{ + wmi_driver_unregister(&msi_wmi_platform_driver); +} + +module_init(msi_wmi_platform_module_init); +module_exit(msi_wmi_platform_module_exit); + MODULE_AUTHOR("Armin Wolf <W_Armin@gmx.de>"); MODULE_DESCRIPTION("MSI WMI platform features"); diff --git a/drivers/pmdomain/arm/scmi_pm_domain.c b/drivers/pmdomain/arm/scmi_pm_domain.c index 8fe1c0a501c9..b5e2ffd5ea64 100644 --- a/drivers/pmdomain/arm/scmi_pm_domain.c +++ b/drivers/pmdomain/arm/scmi_pm_domain.c @@ -41,7 +41,7 @@ static int scmi_pd_power_off(struct generic_pm_domain *domain) static int scmi_pm_domain_probe(struct scmi_device *sdev) { - int num_domains, i; + int num_domains, i, ret; struct device *dev = &sdev->dev; struct device_node *np = dev->of_node; struct scmi_pm_domain *scmi_pd; @@ -108,9 +108,18 @@ static int scmi_pm_domain_probe(struct scmi_device *sdev) scmi_pd_data->domains = domains; scmi_pd_data->num_domains = num_domains; + ret = of_genpd_add_provider_onecell(np, scmi_pd_data); + if (ret) + goto err_rm_genpds; + dev_set_drvdata(dev, scmi_pd_data); - return of_genpd_add_provider_onecell(np, scmi_pd_data); + return 0; +err_rm_genpds: + for (i = num_domains - 1; i >= 0; i--) + pm_genpd_remove(domains[i]); + + return ret; } static void scmi_pm_domain_remove(struct scmi_device *sdev) diff --git a/drivers/pmdomain/imx/gpc.c b/drivers/pmdomain/imx/gpc.c index 33991f3c6b55..a34b260274f7 100644 --- a/drivers/pmdomain/imx/gpc.c +++ b/drivers/pmdomain/imx/gpc.c @@ -536,6 +536,8 @@ static void imx_gpc_remove(struct platform_device *pdev) return; } } + + of_node_put(pgc_node); } static struct platform_driver imx_gpc_driver = { diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c index 0ebe7379b94e..9c9323c8c93a 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.c +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c @@ -748,6 +748,18 @@ static void scpsys_domain_cleanup(struct scpsys *scpsys) } } +static struct device_node *scpsys_get_legacy_regmap(struct device_node *np, const char *pn) +{ + struct device_node *local_node; + + for_each_child_of_node(np, local_node) { + if (of_property_present(local_node, pn)) + return local_node; + } + + return NULL; +} + static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *scpsys) { const u8 bp_blocks[3] = { @@ -769,7 +781,7 @@ static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *s * this makes it then possible to allocate the array of bus_prot * regmaps and convert all to the new style handling. */ - node = of_find_node_with_property(np, "mediatek,infracfg"); + node = scpsys_get_legacy_regmap(np, "mediatek,infracfg"); if (node) { regmap[0] = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg"); of_node_put(node); @@ -782,7 +794,7 @@ static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *s regmap[0] = NULL; } - node = of_find_node_with_property(np, "mediatek,smi"); + node = scpsys_get_legacy_regmap(np, "mediatek,smi"); if (node) { smi_np = of_parse_phandle(node, "mediatek,smi", 0); of_node_put(node); @@ -800,7 +812,7 @@ static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *s regmap[1] = NULL; } - node = of_find_node_with_property(np, "mediatek,infracfg-nao"); + node = scpsys_get_legacy_regmap(np, "mediatek,infracfg-nao"); if (node) { regmap[2] = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg-nao"); num_regmaps++; diff --git a/drivers/pmdomain/samsung/exynos-pm-domains.c b/drivers/pmdomain/samsung/exynos-pm-domains.c index 5d478bb37ad6..5c3aa8983087 100644 --- a/drivers/pmdomain/samsung/exynos-pm-domains.c +++ b/drivers/pmdomain/samsung/exynos-pm-domains.c @@ -92,13 +92,14 @@ static const struct of_device_id exynos_pm_domain_of_match[] = { { }, }; -static const char *exynos_get_domain_name(struct device_node *node) +static const char *exynos_get_domain_name(struct device *dev, + struct device_node *node) { const char *name; if (of_property_read_string(node, "label", &name) < 0) name = kbasename(node->full_name); - return kstrdup_const(name, GFP_KERNEL); + return devm_kstrdup_const(dev, name, GFP_KERNEL); } static int exynos_pd_probe(struct platform_device *pdev) @@ -115,20 +116,27 @@ static int exynos_pd_probe(struct platform_device *pdev) if (!pd) return -ENOMEM; - pd->pd.name = exynos_get_domain_name(np); + pd->pd.name = exynos_get_domain_name(dev, np); if (!pd->pd.name) return -ENOMEM; pd->base = of_iomap(np, 0); - if (!pd->base) { - kfree_const(pd->pd.name); + if (!pd->base) return -ENODEV; - } pd->pd.power_off = exynos_pd_power_off; pd->pd.power_on = exynos_pd_power_on; pd->local_pwr_cfg = pm_domain_cfg->local_pwr_cfg; + /* + * Some Samsung platforms with bootloaders turning on the splash-screen + * and handing it over to the kernel, requires the power-domains to be + * reset during boot. + */ + if (IS_ENABLED(CONFIG_ARM) && + of_device_is_compatible(np, "samsung,exynos4210-pd")) + exynos_pd_power_off(&pd->pd); + on = readl_relaxed(pd->base + 0x4) & pd->local_pwr_cfg; pm_genpd_init(&pd->pd, NULL, !on); @@ -147,15 +155,6 @@ static int exynos_pd_probe(struct platform_device *pdev) parent.np, child.np); } - /* - * Some Samsung platforms with bootloaders turning on the splash-screen - * and handing it over to the kernel, requires the power-domains to be - * reset during boot. As a temporary hack to manage this, let's enforce - * a sync_state. - */ - if (!ret) - of_genpd_sync_state(np); - pm_runtime_enable(dev); return ret; } diff --git a/drivers/pmdomain/tegra/powergate-bpmp.c b/drivers/pmdomain/tegra/powergate-bpmp.c index b0138ca9f851..9f4366250bfd 100644 --- a/drivers/pmdomain/tegra/powergate-bpmp.c +++ b/drivers/pmdomain/tegra/powergate-bpmp.c @@ -184,6 +184,7 @@ tegra_powergate_add(struct tegra_bpmp *bpmp, powergate->genpd.name = kstrdup(info->name, GFP_KERNEL); powergate->genpd.power_on = tegra_powergate_power_on; powergate->genpd.power_off = tegra_powergate_power_off; + powergate->genpd.flags = GENPD_FLAG_NO_STAY_ON; err = pm_genpd_init(&powergate->genpd, NULL, off); if (err < 0) { diff --git a/drivers/power/supply/intel_dc_ti_battery.c b/drivers/power/supply/intel_dc_ti_battery.c index 56b0c92e9d28..67a75281b0ac 100644 --- a/drivers/power/supply/intel_dc_ti_battery.c +++ b/drivers/power/supply/intel_dc_ti_battery.c @@ -127,7 +127,8 @@ struct dc_ti_battery_chip { static int dc_ti_battery_get_voltage_and_current_now(struct power_supply *psy, int *volt, int *curr) { struct dc_ti_battery_chip *chip = power_supply_get_drvdata(psy); - s64 cnt_start_usec, now_usec, sleep_usec; + ktime_t ktime; + s64 sleep_usec; unsigned int reg_val; s32 acc, smpl_ctr; int ret; @@ -141,16 +142,17 @@ static int dc_ti_battery_get_voltage_and_current_now(struct power_supply *psy, i if (ret) goto out_err; - cnt_start_usec = ktime_get_ns() / NSEC_PER_USEC; + ktime = ktime_get(); /* Read Vbat, convert IIO mV to power-supply ųV */ ret = iio_read_channel_processed_scale(chip->vbat_channel, volt, 1000); if (ret < 0) goto out_err; + ktime = ktime_sub(ktime_get(), ktime); + /* Sleep at least 3 sample-times + slack to get 3+ CC samples */ - now_usec = ktime_get_ns() / NSEC_PER_USEC; - sleep_usec = 3 * SMPL_INTVL_US + SLEEP_SLACK_US - (now_usec - cnt_start_usec); + sleep_usec = 3 * SMPL_INTVL_US + SLEEP_SLACK_US - ktime_to_us(ktime); if (sleep_usec > 0 && sleep_usec < 1000000) usleep_range(sleep_usec, sleep_usec + SLEEP_SLACK_US); diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 8106eb617c8c..c61cf9edac48 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -561,10 +561,14 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, return ptp_mask_en_single(pccontext->private_clkdata, argptr); case PTP_SYS_OFFSET_PRECISE_CYCLES: + if (!ptp->has_cycles) + return -EOPNOTSUPP; return ptp_sys_offset_precise(ptp, argptr, ptp->info->getcrosscycles); case PTP_SYS_OFFSET_EXTENDED_CYCLES: + if (!ptp->has_cycles) + return -EOPNOTSUPP; return ptp_sys_offset_extended(ptp, argptr, ptp->info->getcyclesx64); default: diff --git a/drivers/pwm/pwm-adp5585.c b/drivers/pwm/pwm-adp5585.c index dc2860979e24..806f8d79b0d7 100644 --- a/drivers/pwm/pwm-adp5585.c +++ b/drivers/pwm/pwm-adp5585.c @@ -190,13 +190,13 @@ static int adp5585_pwm_probe(struct platform_device *pdev) return 0; } -static const struct adp5585_pwm_chip adp5589_pwm_chip_info = { +static const struct adp5585_pwm_chip adp5585_pwm_chip_info = { .pwm_cfg = ADP5585_PWM_CFG, .pwm_offt_low = ADP5585_PWM_OFFT_LOW, .pwm_ont_low = ADP5585_PWM_ONT_LOW, }; -static const struct adp5585_pwm_chip adp5585_pwm_chip_info = { +static const struct adp5585_pwm_chip adp5589_pwm_chip_info = { .pwm_cfg = ADP5589_PWM_CFG, .pwm_offt_low = ADP5589_PWM_OFFT_LOW, .pwm_ont_low = ADP5589_PWM_ONT_LOW, diff --git a/drivers/ras/amd/atl/core.c b/drivers/ras/amd/atl/core.c index 4197e10993ac..0f7cd6dab0b0 100644 --- a/drivers/ras/amd/atl/core.c +++ b/drivers/ras/amd/atl/core.c @@ -194,6 +194,8 @@ MODULE_DEVICE_TABLE(x86cpu, amd_atl_cpuids); static int __init amd_atl_init(void) { + int ret; + if (!x86_match_cpu(amd_atl_cpuids)) return -ENODEV; @@ -202,8 +204,9 @@ static int __init amd_atl_init(void) check_for_legacy_df_access(); - if (get_df_system_info()) - return -ENODEV; + ret = get_df_system_info(); + if (ret) + return ret; /* Increment this module's recount so that it can't be easily unloaded. */ __module_get(THIS_MODULE); diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h index 2b6279d32774..82a56d9c2be1 100644 --- a/drivers/ras/amd/atl/internal.h +++ b/drivers/ras/amd/atl/internal.h @@ -138,7 +138,8 @@ struct df_flags { __u8 legacy_ficaa : 1, socket_id_shift_quirk : 1, heterogeneous : 1, - __reserved_0 : 5; + prm_only : 1, + __reserved_0 : 4; }; struct df_config { @@ -283,6 +284,9 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err); u64 add_base_and_hole(struct addr_ctx *ctx, u64 addr); u64 remove_base_and_hole(struct addr_ctx *ctx, u64 addr); +/* GUIDs for PRM handlers */ +extern const guid_t norm_to_sys_guid; + #ifdef CONFIG_AMD_ATL_PRM unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr); #else diff --git a/drivers/ras/amd/atl/prm.c b/drivers/ras/amd/atl/prm.c index 0931a20d213b..0f9bfa96e16a 100644 --- a/drivers/ras/amd/atl/prm.c +++ b/drivers/ras/amd/atl/prm.c @@ -29,10 +29,6 @@ struct norm_to_sys_param_buf { void *out_buf; } __packed; -static const guid_t norm_to_sys_guid = GUID_INIT(0xE7180659, 0xA65D, 0x451D, - 0x92, 0xCD, 0x2B, 0x56, 0xF1, - 0x2B, 0xEB, 0xA6); - unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 bank_id, unsigned long addr) { struct norm_to_sys_param_buf p_buf; diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c index e18d916d5e8b..812a30e21d3a 100644 --- a/drivers/ras/amd/atl/system.c +++ b/drivers/ras/amd/atl/system.c @@ -12,6 +12,12 @@ #include "internal.h" +#include <linux/prmt.h> + +const guid_t norm_to_sys_guid = GUID_INIT(0xE7180659, 0xA65D, 0x451D, + 0x92, 0xCD, 0x2B, 0x56, 0xF1, + 0x2B, 0xEB, 0xA6); + int determine_node_id(struct addr_ctx *ctx, u8 socket_id, u8 die_id) { u16 socket_id_bits, die_id_bits; @@ -212,15 +218,17 @@ static int determine_df_rev(void) if (!rev) return determine_df_rev_legacy(); - /* - * Fail out for major revisions other than '4'. - * - * Explicit support should be added for newer systems to avoid issues. - */ if (rev == 4) return df4_determine_df_rev(reg); - return -EINVAL; + /* All other systems should have PRM handlers. */ + if (!acpi_prm_handler_available(&norm_to_sys_guid)) { + pr_debug("PRM not available\n"); + return -ENODEV; + } + + df_cfg.flags.prm_only = true; + return 0; } static int get_dram_hole_base(void) @@ -288,12 +296,18 @@ static void dump_df_cfg(void) int get_df_system_info(void) { - if (determine_df_rev()) { + int ret; + + ret = determine_df_rev(); + if (ret) { pr_warn("Failed to determine DF Revision"); df_cfg.rev = UNKNOWN; - return -EINVAL; + return ret; } + if (df_cfg.flags.prm_only) + return 0; + apply_node_id_shift(); get_num_maps(); diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c index 6e072b7667e9..befc616d5e8a 100644 --- a/drivers/ras/amd/atl/umc.c +++ b/drivers/ras/amd/atl/umc.c @@ -49,17 +49,6 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err) return i; } -/* XOR the bits in @val. */ -static u16 bitwise_xor_bits(u16 val) -{ - u16 tmp = 0; - u8 i; - - for (i = 0; i < 16; i++) - tmp ^= (val >> i) & 0x1; - - return tmp; -} struct xor_bits { bool xor_enable; @@ -250,17 +239,17 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) if (!addr_hash.bank[i].xor_enable) continue; - temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor); - temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor); + temp = hweight16(col & addr_hash.bank[i].col_xor) & 1; + temp ^= hweight16(row & addr_hash.bank[i].row_xor) & 1; bank ^= temp << i; } /* Calculate hash for PC bit. */ if (addr_hash.pc.xor_enable) { - temp = bitwise_xor_bits(col & addr_hash.pc.col_xor); - temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor); + temp = hweight16(col & addr_hash.pc.col_xor) & 1; + temp ^= hweight16(row & addr_hash.pc.row_xor) & 1; /* Bits SID[1:0] act as Bank[5:4] for PC hash, so apply them here. */ - temp ^= bitwise_xor_bits((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor); + temp ^= hweight16((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor) & 1; pc ^= temp; } @@ -422,7 +411,7 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err) socket_id, die_id, coh_st_inst_id, addr); ret_addr = prm_umc_norm_to_sys_addr(socket_id, err->ipid, addr); - if (!IS_ERR_VALUE(ret_addr)) + if (!IS_ERR_VALUE(ret_addr) || df_cfg.flags.prm_only) return ret_addr; return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr); diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index e440b15fbabc..15f7f043c8ef 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -166,7 +166,7 @@ static void cec_mod_work(unsigned long interval) unsigned long iv; iv = interval * HZ; - mod_delayed_work(system_wq, &cec_work, round_jiffies(iv)); + mod_delayed_work(system_percpu_wq, &cec_work, round_jiffies(iv)); } static void cec_work_fn(struct work_struct *work) diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c index 1cb647ed70c6..a2d16e9abfb5 100644 --- a/drivers/regulator/fixed.c +++ b/drivers/regulator/fixed.c @@ -334,6 +334,7 @@ static int reg_fixed_voltage_probe(struct platform_device *pdev) ret = dev_err_probe(&pdev->dev, PTR_ERR(drvdata->dev), "Failed to register regulator: %ld\n", PTR_ERR(drvdata->dev)); + gpiod_put(cfg.ena_gpiod); return ret; } diff --git a/drivers/regulator/rtq2208-regulator.c b/drivers/regulator/rtq2208-regulator.c index 9cde7181b0f0..f669a562f036 100644 --- a/drivers/regulator/rtq2208-regulator.c +++ b/drivers/regulator/rtq2208-regulator.c @@ -53,7 +53,7 @@ #define RTQ2208_MASK_BUCKPH_GROUP1 GENMASK(6, 4) #define RTQ2208_MASK_BUCKPH_GROUP2 GENMASK(2, 0) #define RTQ2208_MASK_LDO2_OPT0 BIT(7) -#define RTQ2208_MASK_LDO2_OPT1 BIT(6) +#define RTQ2208_MASK_LDO2_OPT1 BIT(7) #define RTQ2208_MASK_LDO1_FIXED BIT(6) /* Size */ @@ -543,14 +543,14 @@ static int rtq2208_regulator_check(struct device *dev, int *num, int *regulator_ switch (FIELD_GET(RTQ2208_MASK_BUCKPH_GROUP2, buck_phase)) { case 2: - rtq2208_used_table[RTQ2208_BUCK_F] = true; + rtq2208_used_table[RTQ2208_BUCK_H] = true; fallthrough; case 1: rtq2208_used_table[RTQ2208_BUCK_E] = true; fallthrough; case 0: case 3: - rtq2208_used_table[RTQ2208_BUCK_H] = true; + rtq2208_used_table[RTQ2208_BUCK_F] = true; fallthrough; default: rtq2208_used_table[RTQ2208_BUCK_G] = true; diff --git a/drivers/reset/reset-imx8mp-audiomix.c b/drivers/reset/reset-imx8mp-audiomix.c index 6b357adfe646..eceb37ff5dc5 100644 --- a/drivers/reset/reset-imx8mp-audiomix.c +++ b/drivers/reset/reset-imx8mp-audiomix.c @@ -14,8 +14,8 @@ #include <linux/reset-controller.h> #define IMX8MP_AUDIOMIX_EARC_RESET_OFFSET 0x200 -#define IMX8MP_AUDIOMIX_EARC_RESET_MASK BIT(1) -#define IMX8MP_AUDIOMIX_EARC_PHY_RESET_MASK BIT(2) +#define IMX8MP_AUDIOMIX_EARC_RESET_MASK BIT(0) +#define IMX8MP_AUDIOMIX_EARC_PHY_RESET_MASK BIT(1) #define IMX8MP_AUDIOMIX_DSP_RUNSTALL_OFFSET 0x108 #define IMX8MP_AUDIOMIX_DSP_RUNSTALL_MASK BIT(5) diff --git a/drivers/rtc/rtc-cpcap.c b/drivers/rtc/rtc-cpcap.c index 8b6b35716f53..c170345ac076 100644 --- a/drivers/rtc/rtc-cpcap.c +++ b/drivers/rtc/rtc-cpcap.c @@ -268,7 +268,6 @@ static int cpcap_rtc_probe(struct platform_device *pdev) return err; rtc->alarm_irq = platform_get_irq(pdev, 0); - rtc->alarm_enabled = true; err = devm_request_threaded_irq(dev, rtc->alarm_irq, NULL, cpcap_rtc_alarm_irq, IRQF_TRIGGER_NONE | IRQF_ONESHOT, diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c index aabe62c283a1..7e9f7cb90c28 100644 --- a/drivers/rtc/rtc-rx8025.c +++ b/drivers/rtc/rtc-rx8025.c @@ -316,7 +316,7 @@ static int rx8025_init_client(struct i2c_client *client) return hour_reg; rx8025->is_24 = (hour_reg & RX8035_BIT_HOUR_1224); } else { - rx8025->is_24 = (ctrl[1] & RX8025_BIT_CTRL1_1224); + rx8025->is_24 = (ctrl[0] & RX8025_BIT_CTRL1_1224); } out: return err; diff --git a/drivers/rtc/rtc-tps6586x.c b/drivers/rtc/rtc-tps6586x.c index 76ecf7b798f0..54c8429b16bf 100644 --- a/drivers/rtc/rtc-tps6586x.c +++ b/drivers/rtc/rtc-tps6586x.c @@ -258,7 +258,6 @@ static int tps6586x_rtc_probe(struct platform_device *pdev) irq_set_status_flags(rtc->irq, IRQ_NOAUTOEN); - rtc->irq_en = true; ret = devm_request_threaded_irq(&pdev->dev, rtc->irq, NULL, tps6586x_rtc_irq, IRQF_ONESHOT, diff --git a/drivers/s390/net/ctcm_mpc.c b/drivers/s390/net/ctcm_mpc.c index 0aeafa772fb1..407b7c516658 100644 --- a/drivers/s390/net/ctcm_mpc.c +++ b/drivers/s390/net/ctcm_mpc.c @@ -701,7 +701,6 @@ static void mpc_rcvd_sweep_req(struct mpcg_info *mpcginfo) grp->sweep_req_pend_num--; ctcmpc_send_sweep_resp(ch); - kfree(mpcginfo); return; } diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 4c62c597c7be..b3af9b78fa12 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -2208,9 +2208,17 @@ sg_remove_sfp_usercontext(struct work_struct *work) write_lock_irqsave(&sfp->rq_list_lock, iflags); while (!list_empty(&sfp->rq_list)) { srp = list_first_entry(&sfp->rq_list, Sg_request, entry); - sg_finish_rem_req(srp); list_del(&srp->entry); + write_unlock_irqrestore(&sfp->rq_list_lock, iflags); + + sg_finish_rem_req(srp); + /* + * sg_rq_end_io() uses srp->parentfp. Hence, only clear + * srp->parentfp after blk_mq_free_request() has been called. + */ srp->parentfp = NULL; + + write_lock_irqsave(&sfp->rq_list_lock, iflags); } write_unlock_irqrestore(&sfp->rq_list_lock, iflags); diff --git a/drivers/slimbus/qcom-ngd-ctrl.c b/drivers/slimbus/qcom-ngd-ctrl.c index 4fb66986cc22..cd40ab839c54 100644 --- a/drivers/slimbus/qcom-ngd-ctrl.c +++ b/drivers/slimbus/qcom-ngd-ctrl.c @@ -1241,6 +1241,7 @@ static void qcom_slim_ngd_notify_slaves(struct qcom_slim_ngd_ctrl *ctrl) if (slim_get_logical_addr(sbdev)) dev_err(ctrl->dev, "Failed to get logical address\n"); + put_device(&sbdev->dev); } } diff --git a/drivers/soc/ti/knav_dma.c b/drivers/soc/ti/knav_dma.c index a25ebe6cd503..553ae7ee20f1 100644 --- a/drivers/soc/ti/knav_dma.c +++ b/drivers/soc/ti/knav_dma.c @@ -402,7 +402,7 @@ static int of_channel_match_helper(struct device_node *np, const char *name, * @name: slave channel name * @config: dma configuration parameters * - * Returns pointer to appropriate DMA channel on success or error. + * Return: Pointer to appropriate DMA channel on success or NULL on error. */ void *knav_dma_open_channel(struct device *dev, const char *name, struct knav_dma_cfg *config) @@ -414,13 +414,13 @@ void *knav_dma_open_channel(struct device *dev, const char *name, if (!kdev) { pr_err("keystone-navigator-dma driver not registered\n"); - return (void *)-EINVAL; + return NULL; } chan_num = of_channel_match_helper(dev->of_node, name, &instance); if (chan_num < 0) { dev_err(kdev->dev, "No DMA instance with name %s\n", name); - return (void *)-EINVAL; + return NULL; } dev_dbg(kdev->dev, "initializing %s channel %d from DMA %s\n", @@ -431,7 +431,7 @@ void *knav_dma_open_channel(struct device *dev, const char *name, if (config->direction != DMA_MEM_TO_DEV && config->direction != DMA_DEV_TO_MEM) { dev_err(kdev->dev, "bad direction\n"); - return (void *)-EINVAL; + return NULL; } /* Look for correct dma instance */ @@ -443,7 +443,7 @@ void *knav_dma_open_channel(struct device *dev, const char *name, } if (!dma) { dev_err(kdev->dev, "No DMA instance with name %s\n", instance); - return (void *)-EINVAL; + return NULL; } /* Look for correct dma channel from dma instance */ @@ -463,14 +463,14 @@ void *knav_dma_open_channel(struct device *dev, const char *name, if (!chan) { dev_err(kdev->dev, "channel %d is not in DMA %s\n", chan_num, instance); - return (void *)-EINVAL; + return NULL; } if (atomic_read(&chan->ref_count) >= 1) { if (!check_config(chan, config)) { dev_err(kdev->dev, "channel %d config miss-match\n", chan_num); - return (void *)-EINVAL; + return NULL; } } diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 4d8f00c850c1..55675750182e 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -1181,10 +1181,10 @@ config SPI_TEGRA210_QUAD config SPI_TEGRA114 tristate "NVIDIA Tegra114 SPI Controller" - depends on (ARCH_TEGRA && TEGRA20_APB_DMA) || COMPILE_TEST + depends on ARCH_TEGRA || COMPILE_TEST depends on RESET_CONTROLLER help - SPI driver for NVIDIA Tegra114 SPI Controller interface. This controller + SPI controller driver for NVIDIA Tegra114 and later SoCs. This controller is different than the older SoCs SPI controller and also register interface get changed with this controller. diff --git a/drivers/spi/spi-amlogic-spifc-a1.c b/drivers/spi/spi-amlogic-spifc-a1.c index 18c9aa2cbc29..eb503790017b 100644 --- a/drivers/spi/spi-amlogic-spifc-a1.c +++ b/drivers/spi/spi-amlogic-spifc-a1.c @@ -353,7 +353,9 @@ static int amlogic_spifc_a1_probe(struct platform_device *pdev) pm_runtime_set_autosuspend_delay(spifc->dev, 500); pm_runtime_use_autosuspend(spifc->dev); - devm_pm_runtime_enable(spifc->dev); + ret = devm_pm_runtime_enable(spifc->dev); + if (ret) + return ret; ctrl->num_chipselect = 1; ctrl->dev.of_node = pdev->dev.of_node; diff --git a/drivers/spi/spi-bcm63xx.c b/drivers/spi/spi-bcm63xx.c index b56210734caa..2e3c62f12bef 100644 --- a/drivers/spi/spi-bcm63xx.c +++ b/drivers/spi/spi-bcm63xx.c @@ -247,6 +247,20 @@ static int bcm63xx_txrx_bufs(struct spi_device *spi, struct spi_transfer *first, if (t->rx_buf) { do_rx = true; + + /* + * In certain hardware implementations, there appears to be a + * hidden accumulator that tracks the number of bytes written into + * the hardware FIFO, and this accumulator overrides the length in + * the SPI_MSG_CTL register. + * + * Therefore, for read-only transfers, we need to write some dummy + * value into the FIFO to keep the accumulator tracking the correct + * length. + */ + if (!t->tx_buf) + memset_io(bs->tx_io + len, 0xFF, t->len); + /* prepend is half-duplex write only */ if (t == first) prepend_len = 0; diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 81017402bc56..af6d050da1c8 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -1981,6 +1981,13 @@ static int cqspi_probe(struct platform_device *pdev) cqspi->current_cs = -1; cqspi->sclk = 0; + if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { + pm_runtime_enable(dev); + pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT); + pm_runtime_use_autosuspend(dev); + pm_runtime_get_noresume(dev); + } + ret = cqspi_setup_flash(cqspi); if (ret) { dev_err(dev, "failed to setup flash parameters %d\n", ret); @@ -1995,14 +2002,7 @@ static int cqspi_probe(struct platform_device *pdev) if (cqspi->use_direct_mode) { ret = cqspi_request_mmap_dma(cqspi); if (ret == -EPROBE_DEFER) - goto probe_dma_failed; - } - - if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { - pm_runtime_enable(dev); - pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT); - pm_runtime_use_autosuspend(dev); - pm_runtime_get_noresume(dev); + goto probe_setup_failed; } ret = spi_register_controller(host); @@ -2012,7 +2012,6 @@ static int cqspi_probe(struct platform_device *pdev) } if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) { - pm_runtime_put_autosuspend(dev); pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); } @@ -2021,7 +2020,6 @@ static int cqspi_probe(struct platform_device *pdev) probe_setup_failed: if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) pm_runtime_disable(dev); -probe_dma_failed: cqspi_controller_enable(cqspi, 0); probe_reset_failed: if (cqspi->is_jh7110) diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c index 8da66e101386..065456aba2ae 100644 --- a/drivers/spi/spi-fsl-lpspi.c +++ b/drivers/spi/spi-fsl-lpspi.c @@ -486,7 +486,13 @@ static int fsl_lpspi_setup_transfer(struct spi_controller *controller, fsl_lpspi->tx = fsl_lpspi_buf_tx_u32; } - fsl_lpspi->watermark = min_t(typeof(fsl_lpspi->watermark), + /* + * t->len is 'unsigned' and txfifosize and watermrk is 'u8', force + * type cast is inevitable. When len > 255, len will be truncated in min_t(), + * it caused wrong watermark set. 'unsigned int' is as the designated type + * for min_t() to avoid truncation. + */ + fsl_lpspi->watermark = min_t(unsigned int, fsl_lpspi->txfifosize, t->len); diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 155ddeb8fcd4..bbf1fd4fe1e9 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -519,9 +519,15 @@ static void mx51_ecspi_trigger(struct spi_imx_data *spi_imx) { u32 reg; - reg = readl(spi_imx->base + MX51_ECSPI_CTRL); - reg |= MX51_ECSPI_CTRL_XCH; - writel(reg, spi_imx->base + MX51_ECSPI_CTRL); + if (spi_imx->usedma) { + reg = readl(spi_imx->base + MX51_ECSPI_DMA); + reg |= MX51_ECSPI_DMA_TEDEN | MX51_ECSPI_DMA_RXDEN; + writel(reg, spi_imx->base + MX51_ECSPI_DMA); + } else { + reg = readl(spi_imx->base + MX51_ECSPI_CTRL); + reg |= MX51_ECSPI_CTRL_XCH; + writel(reg, spi_imx->base + MX51_ECSPI_CTRL); + } } static void mx51_ecspi_disable(struct spi_imx_data *spi_imx) @@ -759,7 +765,6 @@ static void mx51_setup_wml(struct spi_imx_data *spi_imx) writel(MX51_ECSPI_DMA_RX_WML(spi_imx->wml - 1) | MX51_ECSPI_DMA_TX_WML(tx_wml) | MX51_ECSPI_DMA_RXT_WML(spi_imx->wml) | - MX51_ECSPI_DMA_TEDEN | MX51_ECSPI_DMA_RXDEN | MX51_ECSPI_DMA_RXTDEN, spi_imx->base + MX51_ECSPI_DMA); } @@ -1520,6 +1525,8 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx, reinit_completion(&spi_imx->dma_tx_completion); dma_async_issue_pending(controller->dma_tx); + spi_imx->devtype_data->trigger(spi_imx); + transfer_timeout = spi_imx_calculate_timeout(spi_imx, transfer->len); /* Wait SDMA to finish the data transfer.*/ diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c index b6c79e50d842..50a7e4916a60 100644 --- a/drivers/spi/spi-nxp-fspi.c +++ b/drivers/spi/spi-nxp-fspi.c @@ -1287,7 +1287,7 @@ static int nxp_fspi_probe(struct platform_device *pdev) { struct spi_controller *ctlr; struct device *dev = &pdev->dev; - struct device_node *np = dev->of_node; + struct fwnode_handle *fwnode = dev_fwnode(dev); struct resource *res; struct nxp_fspi *f; int ret, irq; @@ -1309,7 +1309,7 @@ static int nxp_fspi_probe(struct platform_device *pdev) platform_set_drvdata(pdev, f); /* find the resources - configuration register address space */ - if (is_acpi_node(dev_fwnode(f->dev))) + if (is_acpi_node(fwnode)) f->iobase = devm_platform_ioremap_resource(pdev, 0); else f->iobase = devm_platform_ioremap_resource_byname(pdev, "fspi_base"); @@ -1317,7 +1317,7 @@ static int nxp_fspi_probe(struct platform_device *pdev) return PTR_ERR(f->iobase); /* find the resources - controller memory mapped space */ - if (is_acpi_node(dev_fwnode(f->dev))) + if (is_acpi_node(fwnode)) res = platform_get_resource(pdev, IORESOURCE_MEM, 1); else res = platform_get_resource_byname(pdev, @@ -1330,7 +1330,7 @@ static int nxp_fspi_probe(struct platform_device *pdev) f->memmap_phy_size = resource_size(res); /* find the clocks */ - if (dev_of_node(&pdev->dev)) { + if (is_of_node(fwnode)) { f->clk_en = devm_clk_get(dev, "fspi_en"); if (IS_ERR(f->clk_en)) return PTR_ERR(f->clk_en); @@ -1383,7 +1383,7 @@ static int nxp_fspi_probe(struct platform_device *pdev) else ctlr->mem_caps = &nxp_fspi_mem_caps; - ctlr->dev.of_node = np; + device_set_node(&ctlr->dev, fwnode); ret = devm_add_action_or_reset(dev, nxp_fspi_cleanup, f); if (ret) diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c index d59cc8a18484..c86dc56f38b4 100644 --- a/drivers/spi/spi-xilinx.c +++ b/drivers/spi/spi-xilinx.c @@ -300,7 +300,7 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t) /* Read out all the data from the Rx FIFO */ rx_words = n_words; - stalled = 10; + stalled = 32; while (rx_words) { if (rx_words == n_words && !(stalled--) && !(sr & XSPI_SR_TX_EMPTY_MASK) && diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 2e0647a06890..e25df9990f82 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2851,6 +2851,18 @@ static acpi_status acpi_register_spi_device(struct spi_controller *ctlr, acpi_set_modalias(adev, acpi_device_hid(adev), spi->modalias, sizeof(spi->modalias)); + /* + * This gets re-tried in spi_probe() for -EPROBE_DEFER handling in case + * the GPIO controller does not have a driver yet. This needs to be done + * here too, because this call sets the GPIO direction and/or bias. + * Setting these needs to be done even if there is no driver, in which + * case spi_probe() will never get called. + * TODO: ideally the setup of the GPIO should be handled in a generic + * manner in the ACPI/gpiolib core code. + */ + if (spi->irq < 0) + spi->irq = acpi_dev_gpio_irq_get(adev, 0); + acpi_device_set_enumerated(adev); adev->power.flags.ignore_parent = true; diff --git a/drivers/staging/media/atomisp/i2c/atomisp-gc2235.c b/drivers/staging/media/atomisp/i2c/atomisp-gc2235.c index 6fc39ab95e46..6050637a0def 100644 --- a/drivers/staging/media/atomisp/i2c/atomisp-gc2235.c +++ b/drivers/staging/media/atomisp/i2c/atomisp-gc2235.c @@ -491,7 +491,7 @@ static int gc2235_s_power(struct v4l2_subdev *sd, int on) return ret; } -static int startup(struct v4l2_subdev *sd) +static int gc2235_startup(struct v4l2_subdev *sd) { struct gc2235_device *dev = to_gc2235_sensor(sd); struct i2c_client *client = v4l2_get_subdevdata(sd); @@ -556,7 +556,7 @@ static int gc2235_set_fmt(struct v4l2_subdev *sd, return 0; } - ret = startup(sd); + ret = gc2235_startup(sd); if (ret) { dev_err(&client->dev, "gc2235 startup err\n"); goto err; diff --git a/drivers/staging/media/atomisp/i2c/atomisp-ov2722.c b/drivers/staging/media/atomisp/i2c/atomisp-ov2722.c index c7de7800799a..a4519babf37d 100644 --- a/drivers/staging/media/atomisp/i2c/atomisp-ov2722.c +++ b/drivers/staging/media/atomisp/i2c/atomisp-ov2722.c @@ -600,7 +600,7 @@ static int ov2722_s_power(struct v4l2_subdev *sd, int on) } /* TODO: remove it. */ -static int startup(struct v4l2_subdev *sd) +static int ov2722_startup(struct v4l2_subdev *sd) { struct ov2722_device *dev = to_ov2722_sensor(sd); struct i2c_client *client = v4l2_get_subdevdata(sd); @@ -662,7 +662,7 @@ static int ov2722_set_fmt(struct v4l2_subdev *sd, dev->pixels_per_line = dev->res->pixels_per_line; dev->lines_per_frame = dev->res->lines_per_frame; - ret = startup(sd); + ret = ov2722_startup(sd); if (ret) { int i = 0; @@ -677,7 +677,7 @@ static int ov2722_set_fmt(struct v4l2_subdev *sd, dev_err(&client->dev, "power up failed, continue\n"); continue; } - ret = startup(sd); + ret = ov2722_startup(sd); if (ret) { dev_err(&client->dev, " startup FAILED!\n"); } else { diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index c7b7da629741..01a8e349dc4d 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -894,6 +894,9 @@ static ssize_t tcm_loop_tpg_address_show(struct config_item *item, struct tcm_loop_tpg, tl_se_tpg); struct tcm_loop_hba *tl_hba = tl_tpg->tl_hba; + if (!tl_hba->sh) + return -ENODEV; + return snprintf(page, PAGE_SIZE, "%d:0:%d\n", tl_hba->sh->host_no, tl_tpg->tl_tpgt); } diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index b19acd662726..9e51c535ba8c 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -3670,8 +3670,6 @@ static int __init target_core_init_configfs(void) { struct configfs_subsystem *subsys = &target_core_fabrics; struct t10_alua_lu_gp *lu_gp; - struct cred *kern_cred; - const struct cred *old_cred; int ret; pr_debug("TARGET_CORE[0]: Loading Generic Kernel Storage" @@ -3748,16 +3746,8 @@ static int __init target_core_init_configfs(void) if (ret < 0) goto out; - /* We use the kernel credentials to access the target directory */ - kern_cred = prepare_kernel_cred(&init_task); - if (!kern_cred) { - ret = -ENOMEM; - goto out; - } - old_cred = override_creds(kern_cred); - target_init_dbroot(); - revert_creds(old_cred); - put_cred(kern_cred); + scoped_with_kernel_creds() + target_init_dbroot(); return 0; diff --git a/drivers/tee/qcomtee/call.c b/drivers/tee/qcomtee/call.c index ac134452cc9c..65f9140d4e1f 100644 --- a/drivers/tee/qcomtee/call.c +++ b/drivers/tee/qcomtee/call.c @@ -645,7 +645,7 @@ static void qcomtee_get_version(struct tee_device *teedev, static void qcomtee_get_qtee_feature_list(struct tee_context *ctx, u32 id, u32 *version) { - struct qcomtee_object_invoke_ctx *oic __free(kfree); + struct qcomtee_object_invoke_ctx *oic __free(kfree) = NULL; struct qcomtee_object *client_env, *service; struct qcomtee_arg u[3] = { 0 }; int result; diff --git a/drivers/tee/qcomtee/core.c b/drivers/tee/qcomtee/core.c index b6715ada7700..ecd04403591c 100644 --- a/drivers/tee/qcomtee/core.c +++ b/drivers/tee/qcomtee/core.c @@ -82,7 +82,7 @@ static void qcomtee_do_release_qtee_object(struct work_struct *work) { struct qcomtee_object *object; struct qcomtee *qcomtee; - int ret, result; + int ret, result = 0; /* RELEASE does not require any argument. */ struct qcomtee_arg args[] = { { .type = QCOMTEE_ARG_TYPE_INV } }; diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 5f63f9b9cf40..addb4a20d5ea 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -1538,6 +1538,8 @@ static struct pci_device_id nhi_ids[] = { .driver_data = (kernel_ulong_t)&icl_nhi_ops }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_P_NHI1), .driver_data = (kernel_ulong_t)&icl_nhi_ops }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_WCL_NHI0), + .driver_data = (kernel_ulong_t)&icl_nhi_ops }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_80G_NHI) }, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_40G_NHI) }, diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h index 16744f25a9a0..24ac4246d0ca 100644 --- a/drivers/thunderbolt/nhi.h +++ b/drivers/thunderbolt/nhi.h @@ -75,6 +75,7 @@ extern const struct tb_nhi_ops icl_nhi_ops; #define PCI_DEVICE_ID_INTEL_TITAN_RIDGE_DD_BRIDGE 0x15ef #define PCI_DEVICE_ID_INTEL_ADL_NHI0 0x463e #define PCI_DEVICE_ID_INTEL_ADL_NHI1 0x466d +#define PCI_DEVICE_ID_INTEL_WCL_NHI0 0x4d33 #define PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_80G_NHI 0x5781 #define PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_40G_NHI 0x5784 #define PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HUB_80G_BRIDGE 0x5786 diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index 5af46442a792..81eaca751541 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -438,7 +438,7 @@ static irqreturn_t ser_tx_int(int irq, void *dev_id) * --------------------------------------------------------------- */ -static int startup(struct tty_struct *tty, struct serial_state *info) +static int rs_startup(struct tty_struct *tty, struct serial_state *info) { struct tty_port *port = &info->tport; unsigned long flags; @@ -513,7 +513,7 @@ errout: * This routine will shutdown a serial port; interrupts are disabled, and * DTR is dropped if the hangup on close termio flag is on. */ -static void shutdown(struct tty_struct *tty, struct serial_state *info) +static void rs_shutdown(struct tty_struct *tty, struct serial_state *info) { unsigned long flags; @@ -975,7 +975,7 @@ check_and_exit: change_speed(tty, state, NULL); } } else - retval = startup(tty, state); + retval = rs_startup(tty, state); tty_unlock(tty); return retval; } @@ -1251,9 +1251,9 @@ static void rs_close(struct tty_struct *tty, struct file * filp) */ rs_wait_until_sent(tty, state->timeout); } - shutdown(tty, state); + rs_shutdown(tty, state); rs_flush_buffer(tty); - + tty_ldisc_flush(tty); port->tty = NULL; @@ -1325,7 +1325,7 @@ static void rs_hangup(struct tty_struct *tty) struct serial_state *info = tty->driver_data; rs_flush_buffer(tty); - shutdown(tty, info); + rs_shutdown(tty, info); info->tport.count = 0; tty_port_set_active(&info->tport, false); info->tport.tty = NULL; @@ -1349,7 +1349,7 @@ static int rs_open(struct tty_struct *tty, struct file * filp) port->tty = tty; tty->driver_data = info; - retval = startup(tty, info); + retval = rs_startup(tty, info); if (retval) { return retval; } diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 8bb1a01fef2a..41c1d909525c 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -589,6 +589,23 @@ static inline void legacy_pty_init(void) { } #ifdef CONFIG_UNIX98_PTYS static struct cdev ptmx_cdev; +static struct file *ptm_open_peer_file(struct file *master, + struct tty_struct *tty, int flags) +{ + struct path path; + struct file *file; + + /* Compute the slave's path */ + path.mnt = devpts_mntget(master, tty->driver_data); + if (IS_ERR(path.mnt)) + return ERR_CAST(path.mnt); + path.dentry = tty->link->driver_data; + + file = dentry_open(&path, flags, current_cred()); + mntput(path.mnt); + return file; +} + /** * ptm_open_peer - open the peer of a pty * @master: the open struct file of the ptmx device node @@ -601,42 +618,10 @@ static struct cdev ptmx_cdev; */ int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) { - int fd; - struct file *filp; - int retval = -EINVAL; - struct path path; - if (tty->driver != ptm_driver) return -EIO; - fd = get_unused_fd_flags(flags); - if (fd < 0) { - retval = fd; - goto err; - } - - /* Compute the slave's path */ - path.mnt = devpts_mntget(master, tty->driver_data); - if (IS_ERR(path.mnt)) { - retval = PTR_ERR(path.mnt); - goto err_put; - } - path.dentry = tty->link->driver_data; - - filp = dentry_open(&path, flags, current_cred()); - mntput(path.mnt); - if (IS_ERR(filp)) { - retval = PTR_ERR(filp); - goto err_put; - } - - fd_install(fd, filp); - return fd; - -err_put: - put_unused_fd(fd); -err: - return retval; + return FD_ADD(flags, ptm_open_peer_file(master, tty, flags)); } static int pty_unix98_ioctl(struct tty_struct *tty, diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h index 58e64c4e1e3a..e99f5193d8f1 100644 --- a/drivers/tty/serial/8250/8250.h +++ b/drivers/tty/serial/8250/8250.h @@ -322,13 +322,13 @@ static inline void serial8250_pnp_exit(void) { } #endif #ifdef CONFIG_SERIAL_8250_RSA -void univ8250_rsa_support(struct uart_ops *ops); +void univ8250_rsa_support(struct uart_ops *ops, const struct uart_ops *core_ops); void rsa_enable(struct uart_8250_port *up); void rsa_disable(struct uart_8250_port *up); void rsa_autoconfig(struct uart_8250_port *up); void rsa_reset(struct uart_8250_port *up); #else -static inline void univ8250_rsa_support(struct uart_ops *ops) { } +static inline void univ8250_rsa_support(struct uart_ops *ops, const struct uart_ops *core_ops) { } static inline void rsa_enable(struct uart_8250_port *up) {} static inline void rsa_disable(struct uart_8250_port *up) {} static inline void rsa_autoconfig(struct uart_8250_port *up) {} diff --git a/drivers/tty/serial/8250/8250_platform.c b/drivers/tty/serial/8250/8250_platform.c index b27981340e76..fe7ec440ffa5 100644 --- a/drivers/tty/serial/8250/8250_platform.c +++ b/drivers/tty/serial/8250/8250_platform.c @@ -75,7 +75,7 @@ static void __init __serial8250_isa_init_ports(void) /* chain base port ops to support Remote Supervisor Adapter */ univ8250_port_ops = *univ8250_port_base_ops; - univ8250_rsa_support(&univ8250_port_ops); + univ8250_rsa_support(&univ8250_port_ops, univ8250_port_base_ops); if (share_irqs) irqflag = IRQF_SHARED; diff --git a/drivers/tty/serial/8250/8250_rsa.c b/drivers/tty/serial/8250/8250_rsa.c index 40a3dbd9e452..1f182f165525 100644 --- a/drivers/tty/serial/8250/8250_rsa.c +++ b/drivers/tty/serial/8250/8250_rsa.c @@ -14,6 +14,8 @@ static unsigned long probe_rsa[PORT_RSA_MAX]; static unsigned int probe_rsa_count; +static const struct uart_ops *core_port_base_ops; + static int rsa8250_request_resource(struct uart_8250_port *up) { struct uart_port *port = &up->port; @@ -67,7 +69,7 @@ static void univ8250_config_port(struct uart_port *port, int flags) } } - univ8250_port_base_ops->config_port(port, flags); + core_port_base_ops->config_port(port, flags); if (port->type != PORT_RSA && up->probe & UART_PROBE_RSA) rsa8250_release_resource(up); @@ -78,11 +80,11 @@ static int univ8250_request_port(struct uart_port *port) struct uart_8250_port *up = up_to_u8250p(port); int ret; - ret = univ8250_port_base_ops->request_port(port); + ret = core_port_base_ops->request_port(port); if (ret == 0 && port->type == PORT_RSA) { ret = rsa8250_request_resource(up); if (ret < 0) - univ8250_port_base_ops->release_port(port); + core_port_base_ops->release_port(port); } return ret; @@ -94,15 +96,25 @@ static void univ8250_release_port(struct uart_port *port) if (port->type == PORT_RSA) rsa8250_release_resource(up); - univ8250_port_base_ops->release_port(port); + core_port_base_ops->release_port(port); } -void univ8250_rsa_support(struct uart_ops *ops) +/* + * It is not allowed to directly reference any symbols from 8250.ko here as + * that would result in a dependency loop between the 8250.ko and + * 8250_base.ko modules. This function is called from 8250.ko and is used to + * break the symbolic dependency cycle. Anything that is needed from 8250.ko + * has to be passed as pointers to this function which then can adjust those + * variables on 8250.ko side or store them locally as needed. + */ +void univ8250_rsa_support(struct uart_ops *ops, const struct uart_ops *core_ops) { + core_port_base_ops = core_ops; ops->config_port = univ8250_config_port; ops->request_port = univ8250_request_port; ops->release_port = univ8250_release_port; } +EXPORT_SYMBOL_FOR_MODULES(univ8250_rsa_support, "8250"); module_param_hw_array(probe_rsa, ulong, ioport, &probe_rsa_count, 0444); MODULE_PARM_DESC(probe_rsa, "Probe I/O ports for RSA"); @@ -146,7 +158,6 @@ void rsa_enable(struct uart_8250_port *up) if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) serial_out(up, UART_RSA_FRR, 0); } -EXPORT_SYMBOL_FOR_MODULES(rsa_enable, "8250_base"); /* * Attempts to turn off the RSA FIFO and resets the RSA board back to 115kbps compat mode. It is @@ -178,7 +189,6 @@ void rsa_disable(struct uart_8250_port *up) if (result) up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16; } -EXPORT_SYMBOL_FOR_MODULES(rsa_disable, "8250_base"); void rsa_autoconfig(struct uart_8250_port *up) { @@ -191,7 +201,6 @@ void rsa_autoconfig(struct uart_8250_port *up) if (__rsa_enable(up)) up->port.type = PORT_RSA; } -EXPORT_SYMBOL_FOR_MODULES(rsa_autoconfig, "8250_base"); void rsa_reset(struct uart_8250_port *up) { @@ -200,7 +209,6 @@ void rsa_reset(struct uart_8250_port *up) serial_out(up, UART_RSA_FRR, 0); } -EXPORT_SYMBOL_FOR_MODULES(rsa_reset, "8250_base"); #ifdef CONFIG_SERIAL_8250_DEPRECATED_OPTIONS #ifndef MODULE diff --git a/drivers/tty/serial/8250/Makefile b/drivers/tty/serial/8250/Makefile index 513a0941c284..9ec4d5fe64de 100644 --- a/drivers/tty/serial/8250/Makefile +++ b/drivers/tty/serial/8250/Makefile @@ -7,7 +7,6 @@ obj-$(CONFIG_SERIAL_8250) += 8250.o 8250-y := 8250_core.o 8250-y += 8250_platform.o 8250-$(CONFIG_SERIAL_8250_PNP) += 8250_pnp.o -8250-$(CONFIG_SERIAL_8250_RSA) += 8250_rsa.o obj-$(CONFIG_SERIAL_8250) += 8250_base.o 8250_base-y := 8250_port.o @@ -15,6 +14,7 @@ obj-$(CONFIG_SERIAL_8250) += 8250_base.o 8250_base-$(CONFIG_SERIAL_8250_DWLIB) += 8250_dwlib.o 8250_base-$(CONFIG_SERIAL_8250_FINTEK) += 8250_fintek.o 8250_base-$(CONFIG_SERIAL_8250_PCILIB) += 8250_pcilib.o +8250_base-$(CONFIG_SERIAL_8250_RSA) += 8250_rsa.o obj-$(CONFIG_SERIAL_8250_CONSOLE) += 8250_early.o diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 22939841b1de..7f17d288c807 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -628,7 +628,7 @@ static int pl011_dma_tx_refill(struct uart_amba_port *uap) dmatx->len = count; dmatx->dma = dma_map_single(dma_dev->dev, dmatx->buf, count, DMA_TO_DEVICE); - if (dmatx->dma == DMA_MAPPING_ERROR) { + if (dma_mapping_error(dma_dev->dev, dmatx->dma)) { uap->dmatx.queued = false; dev_dbg(uap->port.dev, "unable to map TX DMA\n"); return -EBUSY; diff --git a/drivers/tty/serial/icom.c b/drivers/tty/serial/icom.c index 7fb995a8490e..d00903cfa841 100644 --- a/drivers/tty/serial/icom.c +++ b/drivers/tty/serial/icom.c @@ -760,7 +760,7 @@ static void load_code(struct icom_port *icom_port) dma_free_coherent(&dev->dev, 4096, new_page, temp_pci); } -static int startup(struct icom_port *icom_port) +static int icom_startup(struct icom_port *icom_port) { unsigned long temp; unsigned char cable_id, raw_cable_id; @@ -832,7 +832,7 @@ unlock: return 0; } -static void shutdown(struct icom_port *icom_port) +static void icom_shutdown(struct icom_port *icom_port) { unsigned long temp; unsigned char cmdReg; @@ -1311,7 +1311,7 @@ static int icom_open(struct uart_port *port) int retval; kref_get(&icom_port->adapter->kref); - retval = startup(icom_port); + retval = icom_startup(icom_port); if (retval) { kref_put(&icom_port->adapter->kref, icom_kref_release); @@ -1333,7 +1333,7 @@ static void icom_close(struct uart_port *port) cmdReg = readb(&icom_port->dram->CmdReg); writeb(cmdReg & ~CMD_RCV_ENABLE, &icom_port->dram->CmdReg); - shutdown(icom_port); + icom_shutdown(icom_port); kref_put(&icom_port->adapter->kref, icom_kref_release); } diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 3865b10d2d43..9d591fb291fd 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -407,9 +407,9 @@ static void wr_reg32(struct slgt_info *info, unsigned int addr, __u32 value); static void msc_set_vcr(struct slgt_info *info); -static int startup(struct slgt_info *info); +static int startup_hw(struct slgt_info *info); static int block_til_ready(struct tty_struct *tty, struct file * filp,struct slgt_info *info); -static void shutdown(struct slgt_info *info); +static void shutdown_hw(struct slgt_info *info); static void program_hw(struct slgt_info *info); static void change_params(struct slgt_info *info); @@ -622,7 +622,7 @@ static int open(struct tty_struct *tty, struct file *filp) if (info->port.count == 1) { /* 1st open on this device, init hardware */ - retval = startup(info); + retval = startup_hw(info); if (retval < 0) { mutex_unlock(&info->port.mutex); goto cleanup; @@ -666,7 +666,7 @@ static void close(struct tty_struct *tty, struct file *filp) flush_buffer(tty); tty_ldisc_flush(tty); - shutdown(info); + shutdown_hw(info); mutex_unlock(&info->port.mutex); tty_port_close_end(&info->port, tty); @@ -687,7 +687,7 @@ static void hangup(struct tty_struct *tty) flush_buffer(tty); mutex_lock(&info->port.mutex); - shutdown(info); + shutdown_hw(info); spin_lock_irqsave(&info->port.lock, flags); info->port.count = 0; @@ -1445,7 +1445,7 @@ static int hdlcdev_open(struct net_device *dev) spin_unlock_irqrestore(&info->netlock, flags); /* claim resources and init adapter */ - if ((rc = startup(info)) != 0) { + if ((rc = startup_hw(info)) != 0) { spin_lock_irqsave(&info->netlock, flags); info->netcount=0; spin_unlock_irqrestore(&info->netlock, flags); @@ -1455,7 +1455,7 @@ static int hdlcdev_open(struct net_device *dev) /* generic HDLC layer open processing */ rc = hdlc_open(dev); if (rc) { - shutdown(info); + shutdown_hw(info); spin_lock_irqsave(&info->netlock, flags); info->netcount = 0; spin_unlock_irqrestore(&info->netlock, flags); @@ -1499,7 +1499,7 @@ static int hdlcdev_close(struct net_device *dev) netif_stop_queue(dev); /* shutdown adapter and release resources */ - shutdown(info); + shutdown_hw(info); hdlc_close(dev); @@ -2328,7 +2328,7 @@ static irqreturn_t slgt_interrupt(int dummy, void *dev_id) return IRQ_HANDLED; } -static int startup(struct slgt_info *info) +static int startup_hw(struct slgt_info *info) { DBGINFO(("%s startup\n", info->device_name)); @@ -2361,7 +2361,7 @@ static int startup(struct slgt_info *info) /* * called by close() and hangup() to shutdown hardware */ -static void shutdown(struct slgt_info *info) +static void shutdown_hw(struct slgt_info *info) { unsigned long flags; diff --git a/drivers/ufs/core/ufs-sysfs.c b/drivers/ufs/core/ufs-sysfs.c index c040afc6668e..0086816b27cd 100644 --- a/drivers/ufs/core/ufs-sysfs.c +++ b/drivers/ufs/core/ufs-sysfs.c @@ -1949,7 +1949,7 @@ static umode_t ufs_sysfs_hid_is_visible(struct kobject *kobj, return hba->dev_info.hid_sup ? attr->mode : 0; } -const struct attribute_group ufs_sysfs_hid_group = { +static const struct attribute_group ufs_sysfs_hid_group = { .name = "hid", .attrs = ufs_sysfs_hid, .is_visible = ufs_sysfs_hid_is_visible, diff --git a/drivers/ufs/core/ufs-sysfs.h b/drivers/ufs/core/ufs-sysfs.h index 6efb82a082fd..8d94af3b8077 100644 --- a/drivers/ufs/core/ufs-sysfs.h +++ b/drivers/ufs/core/ufs-sysfs.h @@ -14,6 +14,5 @@ void ufs_sysfs_remove_nodes(struct device *dev); extern const struct attribute_group ufs_sysfs_unit_descriptor_group; extern const struct attribute_group ufs_sysfs_lun_attributes_group; -extern const struct attribute_group ufs_sysfs_hid_group; #endif diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 9ca27de4767a..d6a060a72461 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -5066,7 +5066,8 @@ static int ufshcd_link_startup(struct ufs_hba *hba) * If UFS device isn't active then we will have to issue link startup * 2 times to make sure the device state move to active. */ - if (!ufshcd_is_ufs_dev_active(hba)) + if (!(hba->quirks & UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE) && + !ufshcd_is_ufs_dev_active(hba)) link_startup_again = true; link_startup: @@ -5131,12 +5132,8 @@ link_startup: ufshcd_readl(hba, REG_UIC_ERROR_CODE_PHY_ADAPTER_LAYER); ret = ufshcd_make_hba_operational(hba); out: - if (ret) { + if (ret) dev_err(hba->dev, "link startup failed %d\n", ret); - ufshcd_print_host_state(hba); - ufshcd_print_pwr_info(hba); - ufshcd_print_evt_hist(hba); - } return ret; } @@ -8503,8 +8500,6 @@ static int ufs_get_device_desc(struct ufs_hba *hba) DEVICE_DESC_PARAM_EXT_UFS_FEATURE_SUP) & UFS_DEV_HID_SUPPORT; - sysfs_update_group(&hba->dev->kobj, &ufs_sysfs_hid_group); - model_index = desc_buf[DEVICE_DESC_PARAM_PRDCT_NAME]; err = ufshcd_read_string_desc(hba, model_index, @@ -10661,7 +10656,7 @@ remove_scsi_host: * @mmio_base: base register address * @irq: Interrupt line of device * - * Return: 0 on success, non-zero value on failure. + * Return: 0 on success; < 0 on failure. */ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) { @@ -10891,8 +10886,8 @@ initialized: if (err) goto out_disable; - async_schedule(ufshcd_async_scan, hba); ufs_sysfs_add_nodes(hba->dev); + async_schedule(ufshcd_async_scan, hba); device_enable_async_suspend(dev); ufshcd_pm_qos_init(hba); @@ -10902,7 +10897,7 @@ out_disable: hba->is_irq_enabled = false; ufshcd_hba_exit(hba); out_error: - return err; + return err > 0 ? -EIO : err; } EXPORT_SYMBOL_GPL(ufshcd_init); diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 3e83dc51d538..eba0e6617483 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -740,8 +740,21 @@ static int ufs_qcom_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, /* reset the connected UFS device during power down */ - if (ufs_qcom_is_link_off(hba) && host->device_reset) + if (ufs_qcom_is_link_off(hba) && host->device_reset) { ufs_qcom_device_reset_ctrl(hba, true); + /* + * After sending the SSU command, asserting the rst_n + * line causes the device firmware to wake up and + * execute its reset routine. + * + * During this process, the device may draw current + * beyond the permissible limit for low-power mode (LPM). + * A 10ms delay, based on experimental observations, + * allows the UFS device to complete its hardware reset + * before transitioning the power rail to LPM. + */ + usleep_range(10000, 11000); + } return ufs_qcom_ice_suspend(host); } diff --git a/drivers/ufs/host/ufshcd-pci.c b/drivers/ufs/host/ufshcd-pci.c index b87e03777395..5f65dfad1a71 100644 --- a/drivers/ufs/host/ufshcd-pci.c +++ b/drivers/ufs/host/ufshcd-pci.c @@ -15,6 +15,7 @@ #include <linux/pci.h> #include <linux/pm_runtime.h> #include <linux/pm_qos.h> +#include <linux/suspend.h> #include <linux/debugfs.h> #include <linux/uuid.h> #include <linux/acpi.h> @@ -31,6 +32,7 @@ struct intel_host { u32 dsm_fns; u32 active_ltr; u32 idle_ltr; + int saved_spm_lvl; struct dentry *debugfs_root; struct gpio_desc *reset_gpio; }; @@ -347,6 +349,7 @@ static int ufs_intel_common_init(struct ufs_hba *hba) host = devm_kzalloc(hba->dev, sizeof(*host), GFP_KERNEL); if (!host) return -ENOMEM; + host->saved_spm_lvl = -1; ufshcd_set_variant(hba, host); intel_dsm_init(host, hba->dev); if (INTEL_DSM_SUPPORTED(host, RESET)) { @@ -425,7 +428,8 @@ static int ufs_intel_lkf_init(struct ufs_hba *hba) static int ufs_intel_adl_init(struct ufs_hba *hba) { hba->nop_out_timeout = 200; - hba->quirks |= UFSHCD_QUIRK_BROKEN_AUTO_HIBERN8; + hba->quirks |= UFSHCD_QUIRK_BROKEN_AUTO_HIBERN8 | + UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE; hba->caps |= UFSHCD_CAP_WB_EN; return ufs_intel_common_init(hba); } @@ -538,6 +542,66 @@ static int ufshcd_pci_restore(struct device *dev) return ufshcd_system_resume(dev); } + +static int ufs_intel_suspend_prepare(struct device *dev) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + struct intel_host *host = ufshcd_get_variant(hba); + int err; + + /* + * Only s2idle (S0ix) retains link state. Force power-off + * (UFS_PM_LVL_5) for any other case. + */ + if (pm_suspend_target_state != PM_SUSPEND_TO_IDLE && hba->spm_lvl < UFS_PM_LVL_5) { + host->saved_spm_lvl = hba->spm_lvl; + hba->spm_lvl = UFS_PM_LVL_5; + } + + err = ufshcd_suspend_prepare(dev); + + if (err < 0 && host->saved_spm_lvl != -1) { + hba->spm_lvl = host->saved_spm_lvl; + host->saved_spm_lvl = -1; + } + + return err; +} + +static void ufs_intel_resume_complete(struct device *dev) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + struct intel_host *host = ufshcd_get_variant(hba); + + ufshcd_resume_complete(dev); + + if (host->saved_spm_lvl != -1) { + hba->spm_lvl = host->saved_spm_lvl; + host->saved_spm_lvl = -1; + } +} + +static int ufshcd_pci_suspend_prepare(struct device *dev) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + if (!strcmp(hba->vops->name, "intel-pci")) + return ufs_intel_suspend_prepare(dev); + + return ufshcd_suspend_prepare(dev); +} + +static void ufshcd_pci_resume_complete(struct device *dev) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + + if (!strcmp(hba->vops->name, "intel-pci")) { + ufs_intel_resume_complete(dev); + return; + } + + ufshcd_resume_complete(dev); +} #endif /** @@ -611,8 +675,8 @@ static const struct dev_pm_ops ufshcd_pci_pm_ops = { .thaw = ufshcd_system_resume, .poweroff = ufshcd_system_suspend, .restore = ufshcd_pci_restore, - .prepare = ufshcd_suspend_prepare, - .complete = ufshcd_resume_complete, + .prepare = ufshcd_pci_suspend_prepare, + .complete = ufshcd_pci_resume_complete, #endif }; diff --git a/drivers/usb/cdns3/cdns3-pci-wrap.c b/drivers/usb/cdns3/cdns3-pci-wrap.c index 3b3b3dc75f35..57f57c24c663 100644 --- a/drivers/usb/cdns3/cdns3-pci-wrap.c +++ b/drivers/usb/cdns3/cdns3-pci-wrap.c @@ -98,10 +98,8 @@ static int cdns3_pci_probe(struct pci_dev *pdev, wrap = pci_get_drvdata(func); } else { wrap = kzalloc(sizeof(*wrap), GFP_KERNEL); - if (!wrap) { - pci_disable_device(pdev); + if (!wrap) return -ENOMEM; - } } res = wrap->dev_res; @@ -160,7 +158,6 @@ static int cdns3_pci_probe(struct pci_dev *pdev, /* register platform device */ wrap->plat_dev = platform_device_register_full(&plat_info); if (IS_ERR(wrap->plat_dev)) { - pci_disable_device(pdev); err = PTR_ERR(wrap->plat_dev); kfree(wrap); return err; diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index ae140c356295..c2ce2f5e60a1 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -25,6 +25,7 @@ #include <linux/of.h> #include <linux/of_graph.h> #include <linux/acpi.h> +#include <linux/pci.h> #include <linux/pinctrl/consumer.h> #include <linux/pinctrl/devinfo.h> #include <linux/reset.h> @@ -2241,7 +2242,7 @@ int dwc3_core_probe(const struct dwc3_probe_data *data) dev_set_drvdata(dev, dwc); dwc3_cache_hwparams(dwc); - if (!dwc->sysdev_is_parent && + if (!dev_is_pci(dwc->sysdev) && DWC3_GHWPARAMS0_AWIDTH(dwc->hwparams.hwparams0) == 64) { ret = dma_set_mask_and_coherent(dwc->sysdev, DMA_BIT_MASK(64)); if (ret) diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index 39c72cb52ce7..8f5faf632a8b 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -21,40 +21,41 @@ #include <linux/acpi.h> #include <linux/delay.h> +#define PCI_DEVICE_ID_INTEL_CMLLP 0x02ee +#define PCI_DEVICE_ID_INTEL_CMLH 0x06ee +#define PCI_DEVICE_ID_INTEL_BXT 0x0aaa #define PCI_DEVICE_ID_INTEL_BYT 0x0f37 #define PCI_DEVICE_ID_INTEL_MRFLD 0x119e -#define PCI_DEVICE_ID_INTEL_BSW 0x22b7 -#define PCI_DEVICE_ID_INTEL_SPTLP 0x9d30 -#define PCI_DEVICE_ID_INTEL_SPTH 0xa130 -#define PCI_DEVICE_ID_INTEL_BXT 0x0aaa #define PCI_DEVICE_ID_INTEL_BXT_M 0x1aaa -#define PCI_DEVICE_ID_INTEL_APL 0x5aaa -#define PCI_DEVICE_ID_INTEL_KBP 0xa2b0 -#define PCI_DEVICE_ID_INTEL_CMLLP 0x02ee -#define PCI_DEVICE_ID_INTEL_CMLH 0x06ee +#define PCI_DEVICE_ID_INTEL_BSW 0x22b7 #define PCI_DEVICE_ID_INTEL_GLK 0x31aa -#define PCI_DEVICE_ID_INTEL_CNPLP 0x9dee -#define PCI_DEVICE_ID_INTEL_CNPH 0xa36e -#define PCI_DEVICE_ID_INTEL_CNPV 0xa3b0 #define PCI_DEVICE_ID_INTEL_ICLLP 0x34ee -#define PCI_DEVICE_ID_INTEL_EHL 0x4b7e -#define PCI_DEVICE_ID_INTEL_TGPLP 0xa0ee #define PCI_DEVICE_ID_INTEL_TGPH 0x43ee -#define PCI_DEVICE_ID_INTEL_JSP 0x4dee -#define PCI_DEVICE_ID_INTEL_WCL 0x4d7e #define PCI_DEVICE_ID_INTEL_ADL 0x460e -#define PCI_DEVICE_ID_INTEL_ADL_PCH 0x51ee #define PCI_DEVICE_ID_INTEL_ADLN 0x465e +#define PCI_DEVICE_ID_INTEL_EHL 0x4b7e +#define PCI_DEVICE_ID_INTEL_WCL 0x4d7e +#define PCI_DEVICE_ID_INTEL_JSP 0x4dee +#define PCI_DEVICE_ID_INTEL_ADL_PCH 0x51ee #define PCI_DEVICE_ID_INTEL_ADLN_PCH 0x54ee -#define PCI_DEVICE_ID_INTEL_ADLS 0x7ae1 -#define PCI_DEVICE_ID_INTEL_RPL 0xa70e +#define PCI_DEVICE_ID_INTEL_APL 0x5aaa +#define PCI_DEVICE_ID_INTEL_NVLS_PCH 0x6e6f +#define PCI_DEVICE_ID_INTEL_ARLH_PCH 0x777e #define PCI_DEVICE_ID_INTEL_RPLS 0x7a61 +#define PCI_DEVICE_ID_INTEL_MTL 0x7e7e +#define PCI_DEVICE_ID_INTEL_ADLS 0x7ae1 #define PCI_DEVICE_ID_INTEL_MTLM 0x7eb1 #define PCI_DEVICE_ID_INTEL_MTLP 0x7ec1 #define PCI_DEVICE_ID_INTEL_MTLS 0x7f6f -#define PCI_DEVICE_ID_INTEL_MTL 0x7e7e -#define PCI_DEVICE_ID_INTEL_ARLH_PCH 0x777e #define PCI_DEVICE_ID_INTEL_TGL 0x9a15 +#define PCI_DEVICE_ID_INTEL_SPTLP 0x9d30 +#define PCI_DEVICE_ID_INTEL_CNPLP 0x9dee +#define PCI_DEVICE_ID_INTEL_TGPLP 0xa0ee +#define PCI_DEVICE_ID_INTEL_SPTH 0xa130 +#define PCI_DEVICE_ID_INTEL_KBP 0xa2b0 +#define PCI_DEVICE_ID_INTEL_CNPH 0xa36e +#define PCI_DEVICE_ID_INTEL_CNPV 0xa3b0 +#define PCI_DEVICE_ID_INTEL_RPL 0xa70e #define PCI_DEVICE_ID_INTEL_PTLH 0xe332 #define PCI_DEVICE_ID_INTEL_PTLH_PCH 0xe37e #define PCI_DEVICE_ID_INTEL_PTLU 0xe432 @@ -412,40 +413,41 @@ static void dwc3_pci_remove(struct pci_dev *pci) } static const struct pci_device_id dwc3_pci_id_table[] = { - { PCI_DEVICE_DATA(INTEL, BSW, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, BYT, &dwc3_pci_intel_byt_swnode) }, - { PCI_DEVICE_DATA(INTEL, MRFLD, &dwc3_pci_intel_mrfld_swnode) }, { PCI_DEVICE_DATA(INTEL, CMLLP, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, CMLH, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, SPTLP, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, SPTH, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, BXT, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, BYT, &dwc3_pci_intel_byt_swnode) }, + { PCI_DEVICE_DATA(INTEL, MRFLD, &dwc3_pci_intel_mrfld_swnode) }, { PCI_DEVICE_DATA(INTEL, BXT_M, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, APL, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, KBP, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, BSW, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, GLK, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, CNPLP, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, CNPH, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, CNPV, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, ICLLP, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, EHL, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, TGPLP, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, TGPH, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, JSP, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, WCL, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, ADL, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, ADL_PCH, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, ADLN, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, EHL, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, WCL, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, JSP, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, ADL_PCH, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, ADLN_PCH, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, ADLS, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, RPL, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, APL, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, NVLS_PCH, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, ARLH_PCH, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, RPLS, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, MTL, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, ADLS, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, MTLM, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, MTLP, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, MTL, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, MTLS, &dwc3_pci_intel_swnode) }, - { PCI_DEVICE_DATA(INTEL, ARLH_PCH, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, TGL, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, SPTLP, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, CNPLP, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, TGPLP, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, SPTH, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, KBP, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, CNPH, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, CNPV, &dwc3_pci_intel_swnode) }, + { PCI_DEVICE_DATA(INTEL, RPL, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, PTLH, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, PTLH_PCH, &dwc3_pci_intel_swnode) }, { PCI_DEVICE_DATA(INTEL, PTLU, &dwc3_pci_intel_swnode) }, diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index b4229aa13f37..e0bad5708664 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -94,6 +94,7 @@ static int __dwc3_gadget_ep0_queue(struct dwc3_ep *dep, req->request.actual = 0; req->request.status = -EINPROGRESS; req->epnum = dep->number; + req->status = DWC3_REQUEST_STATUS_QUEUED; list_add_tail(&req->list, &dep->pending_list); diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 6f18b4840a25..5e4997f974dd 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -228,6 +228,13 @@ void dwc3_gadget_giveback(struct dwc3_ep *dep, struct dwc3_request *req, { struct dwc3 *dwc = dep->dwc; + /* + * The request might have been processed and completed while the + * spinlock was released. Skip processing if already completed. + */ + if (req->status == DWC3_REQUEST_STATUS_COMPLETED) + return; + dwc3_gadget_del_and_unmap_request(dep, req, status); req->status = DWC3_REQUEST_STATUS_COMPLETED; diff --git a/drivers/usb/gadget/function/f_eem.c b/drivers/usb/gadget/function/f_eem.c index 6de81ea17274..edbbadad6138 100644 --- a/drivers/usb/gadget/function/f_eem.c +++ b/drivers/usb/gadget/function/f_eem.c @@ -477,8 +477,13 @@ static int eem_unwrap(struct gether *port, req->complete = eem_cmd_complete; req->zero = 1; req->context = ctx; - if (usb_ep_queue(port->in_ep, req, GFP_ATOMIC)) + if (usb_ep_queue(port->in_ep, req, GFP_ATOMIC)) { DBG(cdev, "echo response queue fail\n"); + kfree(ctx); + kfree(req->buf); + usb_ep_free_request(ep, req); + dev_kfree_skb_any(skb2); + } break; case 1: /* echo response */ diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 694653761c44..8dbe79bdc0f9 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1126,8 +1126,13 @@ static void usb_gadget_state_work(struct work_struct *work) void usb_gadget_set_state(struct usb_gadget *gadget, enum usb_device_state state) { + unsigned long flags; + + spin_lock_irqsave(&gadget->state_lock, flags); gadget->state = state; - schedule_work(&gadget->work); + if (!gadget->teardown) + schedule_work(&gadget->work); + spin_unlock_irqrestore(&gadget->state_lock, flags); trace_usb_gadget_set_state(gadget, 0); } EXPORT_SYMBOL_GPL(usb_gadget_set_state); @@ -1361,6 +1366,8 @@ static void usb_udc_nop_release(struct device *dev) void usb_initialize_gadget(struct device *parent, struct usb_gadget *gadget, void (*release)(struct device *dev)) { + spin_lock_init(&gadget->state_lock); + gadget->teardown = false; INIT_WORK(&gadget->work, usb_gadget_state_work); gadget->dev.parent = parent; @@ -1535,6 +1542,7 @@ EXPORT_SYMBOL_GPL(usb_add_gadget_udc); void usb_del_gadget(struct usb_gadget *gadget) { struct usb_udc *udc = gadget->udc; + unsigned long flags; if (!udc) return; @@ -1548,6 +1556,13 @@ void usb_del_gadget(struct usb_gadget *gadget) kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE); sysfs_remove_link(&udc->dev.kobj, "gadget"); device_del(&gadget->dev); + /* + * Set the teardown flag before flushing the work to prevent new work + * from being scheduled while we are cleaning up. + */ + spin_lock_irqsave(&gadget->state_lock, flags); + gadget->teardown = true; + spin_unlock_irqrestore(&gadget->state_lock, flags); flush_work(&gadget->work); ida_free(&gadget_id_numbers, gadget->id_number); cancel_work_sync(&udc->vbus_work); diff --git a/drivers/usb/gadget/udc/renesas_usbf.c b/drivers/usb/gadget/udc/renesas_usbf.c index 14f4b2cf05a4..4c201574a0af 100644 --- a/drivers/usb/gadget/udc/renesas_usbf.c +++ b/drivers/usb/gadget/udc/renesas_usbf.c @@ -3262,7 +3262,9 @@ static int usbf_probe(struct platform_device *pdev) if (IS_ERR(udc->regs)) return PTR_ERR(udc->regs); - devm_pm_runtime_enable(&pdev->dev); + ret = devm_pm_runtime_enable(&pdev->dev); + if (ret) + return ret; ret = pm_runtime_resume_and_get(&pdev->dev); if (ret < 0) return ret; diff --git a/drivers/usb/host/xhci-dbgcap.h b/drivers/usb/host/xhci-dbgcap.h index 47ac72c2286d..5426c971d2d3 100644 --- a/drivers/usb/host/xhci-dbgcap.h +++ b/drivers/usb/host/xhci-dbgcap.h @@ -114,6 +114,7 @@ struct dbc_port { unsigned int tx_boundary; bool registered; + bool tx_running; }; struct dbc_driver { diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c index d894081d8d15..57cdda4e09c8 100644 --- a/drivers/usb/host/xhci-dbgtty.c +++ b/drivers/usb/host/xhci-dbgtty.c @@ -47,7 +47,7 @@ dbc_kfifo_to_req(struct dbc_port *port, char *packet) return len; } -static int dbc_start_tx(struct dbc_port *port) +static int dbc_do_start_tx(struct dbc_port *port) __releases(&port->port_lock) __acquires(&port->port_lock) { @@ -57,6 +57,8 @@ static int dbc_start_tx(struct dbc_port *port) bool do_tty_wake = false; struct list_head *pool = &port->write_pool; + port->tx_running = true; + while (!list_empty(pool)) { req = list_entry(pool->next, struct dbc_request, list_pool); len = dbc_kfifo_to_req(port, req->buf); @@ -77,12 +79,25 @@ static int dbc_start_tx(struct dbc_port *port) } } + port->tx_running = false; + if (do_tty_wake && port->port.tty) tty_wakeup(port->port.tty); return status; } +/* must be called with port->port_lock held */ +static int dbc_start_tx(struct dbc_port *port) +{ + lockdep_assert_held(&port->port_lock); + + if (port->tx_running) + return -EBUSY; + + return dbc_do_start_tx(port); +} + static void dbc_start_rx(struct dbc_port *port) __releases(&port->port_lock) __acquires(&port->port_lock) @@ -535,6 +550,12 @@ static void xhci_dbc_tty_unregister_device(struct xhci_dbc *dbc) if (!port->registered) return; + /* + * Hang up the TTY. This wakes up any blocked + * writers and causes subsequent writes to fail. + */ + tty_vhangup(port->port.tty); + tty_unregister_device(dbc_tty_driver, port->minor); xhci_dbc_tty_exit_port(port); port->registered = false; diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 8e209aa33ea7..5bdcf9ab2b99 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -1985,6 +1985,7 @@ static void xhci_cavium_reset_phy_quirk(struct xhci_hcd *xhci) static void handle_port_status(struct xhci_hcd *xhci, union xhci_trb *event) { + struct xhci_virt_device *vdev = NULL; struct usb_hcd *hcd; u32 port_id; u32 portsc, cmd_reg; @@ -2016,6 +2017,9 @@ static void handle_port_status(struct xhci_hcd *xhci, union xhci_trb *event) goto cleanup; } + if (port->slot_id) + vdev = xhci->devs[port->slot_id]; + /* We might get interrupts after shared_hcd is removed */ if (port->rhub == &xhci->usb3_rhub && xhci->shared_hcd == NULL) { xhci_dbg(xhci, "ignore port event for removed USB3 hcd\n"); @@ -2038,10 +2042,11 @@ static void handle_port_status(struct xhci_hcd *xhci, union xhci_trb *event) usb_hcd_resume_root_hub(hcd); } - if (hcd->speed >= HCD_USB3 && - (portsc & PORT_PLS_MASK) == XDEV_INACTIVE) { - if (port->slot_id && xhci->devs[port->slot_id]) - xhci->devs[port->slot_id]->flags |= VDEV_PORT_ERROR; + if (vdev && (portsc & PORT_PLS_MASK) == XDEV_INACTIVE) { + if (!(portsc & PORT_RESET)) + vdev->flags |= VDEV_PORT_ERROR; + } else if (vdev && portsc & PORT_RC) { + vdev->flags &= ~VDEV_PORT_ERROR; } if ((portsc & PORT_PLC) && (portsc & PORT_PLS_MASK) == XDEV_RESUME) { @@ -2099,7 +2104,7 @@ static void handle_port_status(struct xhci_hcd *xhci, union xhci_trb *event) * so the roothub behavior is consistent with external * USB 3.0 hub behavior. */ - if (port->slot_id && xhci->devs[port->slot_id]) + if (vdev) xhci_ring_device(xhci, port->slot_id); if (bus_state->port_remote_wakeup & (1 << hcd_portnum)) { xhci_test_and_clear_bit(xhci, port, PORT_PLC); diff --git a/drivers/usb/host/xhci-sideband.c b/drivers/usb/host/xhci-sideband.c index e771a476fef2..a85f62a73313 100644 --- a/drivers/usb/host/xhci-sideband.c +++ b/drivers/usb/host/xhci-sideband.c @@ -73,9 +73,12 @@ err: return NULL; } +/* Caller must hold sb->mutex */ static void __xhci_sideband_remove_endpoint(struct xhci_sideband *sb, struct xhci_virt_ep *ep) { + lockdep_assert_held(&sb->mutex); + /* * Issue a stop endpoint command when an endpoint is removed. * The stop ep cmd handler will handle the ring cleanup. @@ -86,6 +89,25 @@ __xhci_sideband_remove_endpoint(struct xhci_sideband *sb, struct xhci_virt_ep *e sb->eps[ep->ep_index] = NULL; } +/* Caller must hold sb->mutex */ +static void +__xhci_sideband_remove_interrupter(struct xhci_sideband *sb) +{ + struct usb_device *udev; + + lockdep_assert_held(&sb->mutex); + + if (!sb->ir) + return; + + xhci_remove_secondary_interrupter(xhci_to_hcd(sb->xhci), sb->ir); + sb->ir = NULL; + udev = sb->vdev->udev; + + if (udev->state != USB_STATE_NOTATTACHED) + usb_offload_put(udev); +} + /* sideband api functions */ /** @@ -131,14 +153,16 @@ xhci_sideband_add_endpoint(struct xhci_sideband *sb, struct xhci_virt_ep *ep; unsigned int ep_index; - mutex_lock(&sb->mutex); + guard(mutex)(&sb->mutex); + + if (!sb->vdev) + return -ENODEV; + ep_index = xhci_get_endpoint_index(&host_ep->desc); ep = &sb->vdev->eps[ep_index]; - if (ep->ep_state & EP_HAS_STREAMS) { - mutex_unlock(&sb->mutex); + if (ep->ep_state & EP_HAS_STREAMS) return -EINVAL; - } /* * Note, we don't know the DMA mask of the audio DSP device, if its @@ -148,14 +172,11 @@ xhci_sideband_add_endpoint(struct xhci_sideband *sb, * and let this function add the endpoint and allocate the ring buffer * with the smallest common DMA mask */ - if (sb->eps[ep_index] || ep->sideband) { - mutex_unlock(&sb->mutex); + if (sb->eps[ep_index] || ep->sideband) return -EBUSY; - } ep->sideband = sb; sb->eps[ep_index] = ep; - mutex_unlock(&sb->mutex); return 0; } @@ -180,18 +201,16 @@ xhci_sideband_remove_endpoint(struct xhci_sideband *sb, struct xhci_virt_ep *ep; unsigned int ep_index; - mutex_lock(&sb->mutex); + guard(mutex)(&sb->mutex); + ep_index = xhci_get_endpoint_index(&host_ep->desc); ep = sb->eps[ep_index]; - if (!ep || !ep->sideband || ep->sideband != sb) { - mutex_unlock(&sb->mutex); + if (!ep || !ep->sideband || ep->sideband != sb) return -ENODEV; - } __xhci_sideband_remove_endpoint(sb, ep); xhci_initialize_ring_info(ep->ring); - mutex_unlock(&sb->mutex); return 0; } @@ -316,28 +335,25 @@ xhci_sideband_create_interrupter(struct xhci_sideband *sb, int num_seg, if (!sb || !sb->xhci) return -ENODEV; - mutex_lock(&sb->mutex); - if (sb->ir) { - ret = -EBUSY; - goto out; - } + guard(mutex)(&sb->mutex); + + if (!sb->vdev) + return -ENODEV; + + if (sb->ir) + return -EBUSY; sb->ir = xhci_create_secondary_interrupter(xhci_to_hcd(sb->xhci), num_seg, imod_interval, intr_num); - if (!sb->ir) { - ret = -ENOMEM; - goto out; - } + if (!sb->ir) + return -ENOMEM; udev = sb->vdev->udev; ret = usb_offload_get(udev); sb->ir->ip_autoclear = ip_autoclear; -out: - mutex_unlock(&sb->mutex); - return ret; } EXPORT_SYMBOL_GPL(xhci_sideband_create_interrupter); @@ -352,21 +368,12 @@ EXPORT_SYMBOL_GPL(xhci_sideband_create_interrupter); void xhci_sideband_remove_interrupter(struct xhci_sideband *sb) { - struct usb_device *udev; - - if (!sb || !sb->ir) + if (!sb) return; - mutex_lock(&sb->mutex); - xhci_remove_secondary_interrupter(xhci_to_hcd(sb->xhci), sb->ir); - - sb->ir = NULL; - udev = sb->vdev->udev; + guard(mutex)(&sb->mutex); - if (udev->state != USB_STATE_NOTATTACHED) - usb_offload_put(udev); - - mutex_unlock(&sb->mutex); + __xhci_sideband_remove_interrupter(sb); } EXPORT_SYMBOL_GPL(xhci_sideband_remove_interrupter); @@ -465,6 +472,7 @@ EXPORT_SYMBOL_GPL(xhci_sideband_register); void xhci_sideband_unregister(struct xhci_sideband *sb) { + struct xhci_virt_device *vdev; struct xhci_hcd *xhci; int i; @@ -473,17 +481,23 @@ xhci_sideband_unregister(struct xhci_sideband *sb) xhci = sb->xhci; - mutex_lock(&sb->mutex); - for (i = 0; i < EP_CTX_PER_DEV; i++) - if (sb->eps[i]) - __xhci_sideband_remove_endpoint(sb, sb->eps[i]); - mutex_unlock(&sb->mutex); + scoped_guard(mutex, &sb->mutex) { + vdev = sb->vdev; + if (!vdev) + return; + + for (i = 0; i < EP_CTX_PER_DEV; i++) + if (sb->eps[i]) + __xhci_sideband_remove_endpoint(sb, sb->eps[i]); - xhci_sideband_remove_interrupter(sb); + __xhci_sideband_remove_interrupter(sb); + + sb->vdev = NULL; + } spin_lock_irq(&xhci->lock); sb->xhci = NULL; - sb->vdev->sideband = NULL; + vdev->sideband = NULL; spin_unlock_irq(&xhci->lock); kfree(sb); diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 0cb45b95e4f5..a148a1280126 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -4007,6 +4007,7 @@ static int xhci_discover_or_reset_device(struct usb_hcd *hcd, xhci_get_slot_state(xhci, virt_dev->out_ctx)); xhci_dbg(xhci, "Not freeing device rings.\n"); /* Don't treat this as an error. May change my mind later. */ + virt_dev->flags = 0; ret = 0; goto command_cleanup; case COMP_SUCCESS: diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c index 8f536f2c500f..dc2fec9168b7 100644 --- a/drivers/usb/renesas_usbhs/common.c +++ b/drivers/usb/renesas_usbhs/common.c @@ -813,18 +813,18 @@ static void usbhs_remove(struct platform_device *pdev) flush_delayed_work(&priv->notify_hotplug_work); - /* power off */ - if (!usbhs_get_dparam(priv, runtime_pwctrl)) - usbhsc_power_ctrl(priv, 0); - - pm_runtime_disable(&pdev->dev); - usbhs_platform_call(priv, hardware_exit, pdev); - usbhsc_clk_put(priv); reset_control_assert(priv->rsts); usbhs_mod_remove(priv); usbhs_fifo_remove(priv); usbhs_pipe_remove(priv); + + /* power off */ + if (!usbhs_get_dparam(priv, runtime_pwctrl)) + usbhsc_power_ctrl(priv, 0); + + usbhsc_clk_put(priv); + pm_runtime_disable(&pdev->dev); } static int usbhsc_suspend(struct device *dev) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 49666c33b41f..b37fa31f5694 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -1074,6 +1074,7 @@ static const struct usb_device_id id_table_combined[] = { /* U-Blox devices */ { USB_DEVICE(UBLOX_VID, UBLOX_C099F9P_ZED_PID) }, { USB_DEVICE(UBLOX_VID, UBLOX_C099F9P_ODIN_PID) }, + { USB_DEVICE_INTERFACE_NUMBER(UBLOX_VID, UBLOX_EVK_M101_PID, 2) }, /* FreeCalypso USB adapters */ { USB_DEVICE(FTDI_VID, FTDI_FALCONIA_JTAG_BUF_PID), .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 4cc1fae8acb9..2539b9e2f712 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -1614,6 +1614,7 @@ #define UBLOX_VID 0x1546 #define UBLOX_C099F9P_ZED_PID 0x0502 #define UBLOX_C099F9P_ODIN_PID 0x0503 +#define UBLOX_EVK_M101_PID 0x0506 /* * GMC devices diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 5de856f65f0d..e9400727ad36 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2424,12 +2424,18 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1406, 0xff) }, /* GosunCn GM500 ECM/NCM */ { USB_DEVICE(0x33f8, 0x0104), /* Rolling RW101-GL (laptop RMNET) */ .driver_info = RSVD(4) | RSVD(5) }, + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0115, 0xff), /* Rolling RW135-GL (laptop MBIM) */ + .driver_info = RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a2, 0xff) }, /* Rolling RW101-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a3, 0xff) }, /* Rolling RW101-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a4, 0xff), /* Rolling RW101-GL (laptop MBIM) */ .driver_info = RSVD(4) }, - { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0115, 0xff), /* Rolling RW135-GL (laptop MBIM) */ - .driver_info = RSVD(5) }, + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a8, 0xff), /* Rolling RW101R-GL (laptop MBIM) */ + .driver_info = RSVD(4) }, + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a9, 0xff), /* Rolling RW101R-GL (laptop MBIM) */ + .driver_info = RSVD(4) }, + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0301, 0xff) }, /* Rolling RW101R-GL (laptop MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0302, 0xff) }, /* Rolling RW101R-GL (laptop MBIM) */ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0802, 0xff), /* Rolling RW350-GL (laptop MBIM) */ .driver_info = RSVD(5) }, { USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0100, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for Global */ diff --git a/drivers/usb/storage/sddr55.c b/drivers/usb/storage/sddr55.c index b323f0a36260..9d813727e65f 100644 --- a/drivers/usb/storage/sddr55.c +++ b/drivers/usb/storage/sddr55.c @@ -469,6 +469,12 @@ static int sddr55_write_data(struct us_data *us, new_pba = (status[3] + (status[4] << 8) + (status[5] << 16)) >> info->blockshift; + /* check if device-reported new_pba is out of range */ + if (new_pba >= (info->capacity >> (info->blockshift + info->pageshift))) { + result = USB_STOR_TRANSPORT_FAILED; + goto leave; + } + /* check status for error */ if (status[0] == 0xff && status[1] == 0x4) { info->pba_to_lba[new_pba] = BAD_BLOCK; diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index 1aa1bd26c81f..9a4bf86e7b6a 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -1200,7 +1200,23 @@ int usb_stor_Bulk_transport(struct scsi_cmnd *srb, struct us_data *us) US_BULK_CS_WRAP_LEN && bcs->Signature == cpu_to_le32(US_BULK_CS_SIGN)) { + unsigned char buf[US_BULK_CS_WRAP_LEN]; + usb_stor_dbg(us, "Device skipped data phase\n"); + + /* + * Devices skipping data phase might leave CSW data in srb's + * transfer buffer. Zero it to prevent USB protocol leakage. + */ + sg = NULL; + offset = 0; + memset(buf, 0, sizeof(buf)); + if (usb_stor_access_xfer_buf(buf, + US_BULK_CS_WRAP_LEN, srb, &sg, + &offset, TO_XFER_BUF) != + US_BULK_CS_WRAP_LEN) + usb_stor_dbg(us, "Failed to clear CSW data\n"); + scsi_set_resid(srb, transfer_length); goto skipped_data_phase; } diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index 4ed0dc19afe0..45b01df364f7 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -698,6 +698,10 @@ static int uas_queuecommand_lck(struct scsi_cmnd *cmnd) * of queueing, no matter how fatal the error */ if (err == -ENODEV) { + if (cmdinfo->state & (COMMAND_INFLIGHT | DATA_IN_URB_INFLIGHT | + DATA_OUT_URB_INFLIGHT)) + goto out; + set_host_byte(cmnd, DID_NO_CONNECT); scsi_done(cmnd); goto zombie; @@ -711,6 +715,7 @@ static int uas_queuecommand_lck(struct scsi_cmnd *cmnd) uas_add_work(cmnd); } +out: devinfo->cmnd[idx] = cmnd; zombie: spin_unlock_irqrestore(&devinfo->lock, flags); diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index dfa5276a5a43..47f50d7a385c 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -938,7 +938,7 @@ UNUSUAL_DEV( 0x05e3, 0x0723, 0x9451, 0x9451, UNUSUAL_DEV( 0x0603, 0x8611, 0x0000, 0xffff, "Novatek", "NTK96550-based camera", - USB_SC_SCSI, USB_PR_BULK, NULL, + USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_BULK_IGNORE_TAG ), /* diff --git a/drivers/usb/typec/ucsi/psy.c b/drivers/usb/typec/ucsi/psy.c index 62a9d68bb66d..8ae900c8c132 100644 --- a/drivers/usb/typec/ucsi/psy.c +++ b/drivers/usb/typec/ucsi/psy.c @@ -145,6 +145,11 @@ static int ucsi_psy_get_current_max(struct ucsi_connector *con, { u32 pdo; + if (!UCSI_CONSTAT(con, CONNECTED)) { + val->intval = 0; + return 0; + } + switch (UCSI_CONSTAT(con, PWR_OPMODE)) { case UCSI_CONSTAT_PWR_OPMODE_PD: if (con->num_pdos > 0) { diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 82034efb74fc..a7936bd1aabe 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -573,6 +573,8 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) vcq->mcq.set_ci_db = vcq->db.db; vcq->mcq.arm_db = vcq->db.db + 1; vcq->mcq.cqe_sz = 64; + vcq->mcq.comp = mlx5_vdpa_cq_comp; + vcq->cqe = num_ent; err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent); if (err) @@ -612,10 +614,6 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) if (err) goto err_vec; - vcq->mcq.comp = mlx5_vdpa_cq_comp; - vcq->cqe = num_ent; - vcq->mcq.set_ci_db = vcq->db.db; - vcq->mcq.arm_db = vcq->db.db + 1; mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index); kfree(in); return 0; diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index c376a6279de0..d47ffada6912 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -299,10 +299,8 @@ static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, char __user *arg) { struct vfio_device *device; - struct file *filep; char *buf; - int fdno; - int ret; + int fd; buf = strndup_user(arg, PAGE_SIZE); if (IS_ERR(buf)) @@ -313,26 +311,10 @@ static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, if (IS_ERR(device)) return PTR_ERR(device); - fdno = get_unused_fd_flags(O_CLOEXEC); - if (fdno < 0) { - ret = fdno; - goto err_put_device; - } - - filep = vfio_device_open_file(device); - if (IS_ERR(filep)) { - ret = PTR_ERR(filep); - goto err_put_fdno; - } - - fd_install(fdno, filep); - return fdno; - -err_put_fdno: - put_unused_fd(fdno); -err_put_device: - vfio_device_put_registration(device); - return ret; + fd = FD_ADD(O_CLOEXEC, vfio_device_open_file(device)); + if (fd < 0) + vfio_device_put_registration(device); + return fd; } static int vfio_group_ioctl_get_status(struct vfio_group *group, diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 35ded4330431..8f7f50acb6d6 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net, static int vhost_net_tx_get_vq_desc(struct vhost_net *net, struct vhost_net_virtqueue *tnvq, unsigned int *out_num, unsigned int *in_num, - struct msghdr *msghdr, bool *busyloop_intr) + struct msghdr *msghdr, bool *busyloop_intr, + unsigned int *ndesc) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *rvq = &rnvq->vq; struct vhost_virtqueue *tvq = &tnvq->vq; - int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), - out_num, in_num, NULL, NULL); + int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), + out_num, in_num, NULL, NULL, ndesc); if (r == tvq->num && tvq->busyloop_timeout) { /* Flush batched packets first */ @@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net, vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false); - r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), - out_num, in_num, NULL, NULL); + r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov), + out_num, in_num, NULL, NULL, ndesc); } return r; @@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net, struct vhost_net_virtqueue *nvq, struct msghdr *msg, unsigned int *out, unsigned int *in, - size_t *len, bool *busyloop_intr) + size_t *len, bool *busyloop_intr, + unsigned int *ndesc) { struct vhost_virtqueue *vq = &nvq->vq; int ret; - ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr); + ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, + busyloop_intr, ndesc); if (ret < 0 || ret == vq->num) return ret; @@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) int sent_pkts = 0; bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); + unsigned int ndesc = 0; do { bool busyloop_intr = false; @@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) vhost_tx_batch(net, nvq, sock, &msg); head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, - &busyloop_intr); + &busyloop_intr, &ndesc); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; @@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) goto done; } else if (unlikely(err != -ENOSPC)) { vhost_tx_batch(net, nvq, sock, &msg); - vhost_discard_vq_desc(vq, 1); + vhost_discard_vq_desc(vq, 1, ndesc); vhost_net_enable_vq(net, vq); break; } @@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) { - vhost_discard_vq_desc(vq, 1); + vhost_discard_vq_desc(vq, 1, ndesc); vhost_net_enable_vq(net, vq); break; } @@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) int err; struct vhost_net_ubuf_ref *ubufs; struct ubuf_info_msgzc *ubuf; + unsigned int ndesc = 0; bool zcopy_used; int sent_pkts = 0; @@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) busyloop_intr = false; head = get_tx_bufs(net, nvq, &msg, &out, &in, &len, - &busyloop_intr); + &busyloop_intr, &ndesc); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; @@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; } if (retry) { - vhost_discard_vq_desc(vq, 1); + vhost_discard_vq_desc(vq, 1, ndesc); vhost_net_enable_vq(net, vq); break; } @@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq, unsigned *iovcount, struct vhost_log *log, unsigned *log_num, - unsigned int quota) + unsigned int quota, + unsigned int *ndesc) { struct vhost_virtqueue *vq = &nvq->vq; bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); - unsigned int out, in; + unsigned int out, in, desc_num, n = 0; int seg = 0; int headcount = 0; unsigned d; @@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq, r = -ENOBUFS; goto err; } - r = vhost_get_vq_desc(vq, vq->iov + seg, - ARRAY_SIZE(vq->iov) - seg, &out, - &in, log, log_num); + r = vhost_get_vq_desc_n(vq, vq->iov + seg, + ARRAY_SIZE(vq->iov) - seg, &out, + &in, log, log_num, &desc_num); if (unlikely(r < 0)) goto err; @@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq, ++headcount; datalen -= len; seg += in; + n += desc_num; } *iovcount = seg; @@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq, nheads[0] = headcount; } + *ndesc = n; + return headcount; err: - vhost_discard_vq_desc(vq, headcount); + vhost_discard_vq_desc(vq, headcount, n); return r; } @@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net) struct iov_iter fixup; __virtio16 num_buffers; int recv_pkts = 0; + unsigned int ndesc; mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX); sock = vhost_vq_get_backend(vq); @@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net) headcount = get_rx_bufs(nvq, vq->heads + count, vq->nheads + count, vhost_len, &in, vq_log, &log, - likely(mergeable) ? UIO_MAXIOV : 1); + likely(mergeable) ? UIO_MAXIOV : 1, + &ndesc); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) goto out; @@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net) if (unlikely(err != sock_len)) { pr_debug("Discarded rx packet: " " len %d, expected %zd\n", err, sock_len); - vhost_discard_vq_desc(vq, headcount); + vhost_discard_vq_desc(vq, headcount, ndesc); continue; } /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ @@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net) copy_to_iter(&num_buffers, sizeof num_buffers, &fixup) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); - vhost_discard_vq_desc(vq, headcount); + vhost_discard_vq_desc(vq, headcount, ndesc); goto out; } nvq->done_idx += headcount; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 8570fdf2e14a..a78226b37739 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2792,18 +2792,34 @@ static int get_indirect(struct vhost_virtqueue *vq, return 0; } -/* This looks in the virtqueue and for the first available buffer, and converts - * it to an iovec for convenient access. Since descriptors consist of some - * number of output then some number of input descriptors, it's actually two - * iovecs, but we pack them into one and note how many of each there were. +/** + * vhost_get_vq_desc_n - Fetch the next available descriptor chain and build iovecs + * @vq: target virtqueue + * @iov: array that receives the scatter/gather segments + * @iov_size: capacity of @iov in elements + * @out_num: the number of output segments + * @in_num: the number of input segments + * @log: optional array to record addr/len for each writable segment; NULL if unused + * @log_num: optional output; number of entries written to @log when provided + * @ndesc: optional output; number of descriptors consumed from the available ring + * (useful for rollback via vhost_discard_vq_desc) * - * This function returns the descriptor number found, or vq->num (which is - * never a valid descriptor number) if none was found. A negative code is - * returned on error. */ -int vhost_get_vq_desc(struct vhost_virtqueue *vq, - struct iovec iov[], unsigned int iov_size, - unsigned int *out_num, unsigned int *in_num, - struct vhost_log *log, unsigned int *log_num) + * Extracts one available descriptor chain from @vq and translates guest addresses + * into host iovecs. + * + * On success, advances @vq->last_avail_idx by 1 and @vq->next_avail_head by the + * number of descriptors consumed (also stored via @ndesc when non-NULL). + * + * Return: + * - head index in [0, @vq->num) on success; + * - @vq->num if no descriptor is currently available; + * - negative errno on failure + */ +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num, + unsigned int *ndesc) { bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); struct vring_desc desc; @@ -2921,17 +2937,49 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, vq->last_avail_idx++; vq->next_avail_head += c; + if (ndesc) + *ndesc = c; + /* Assume notifications from guest are disabled at this point, * if they aren't we would need to update avail_event index. */ BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY)); return head; } +EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n); + +/* This looks in the virtqueue and for the first available buffer, and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function returns the descriptor number found, or vq->num (which is + * never a valid descriptor number) if none was found. A negative code is + * returned on error. + */ +int vhost_get_vq_desc(struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num) +{ + return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num, + log, log_num, NULL); +} EXPORT_SYMBOL_GPL(vhost_get_vq_desc); -/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ -void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n) +/** + * vhost_discard_vq_desc - Reverse the effect of vhost_get_vq_desc_n() + * @vq: target virtqueue + * @nbufs: number of buffers to roll back + * @ndesc: number of descriptors to roll back + * + * Rewinds the internal consumer cursors after a failed attempt to use buffers + * returned by vhost_get_vq_desc_n(). + */ +void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int nbufs, + unsigned int ndesc) { - vq->last_avail_idx -= n; + vq->next_avail_head -= ndesc; + vq->last_avail_idx -= nbufs; } EXPORT_SYMBOL_GPL(vhost_discard_vq_desc); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 621a6d9a8791..b49f08e4a1b4 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -230,7 +230,15 @@ int vhost_get_vq_desc(struct vhost_virtqueue *, struct iovec iov[], unsigned int iov_size, unsigned int *out_num, unsigned int *in_num, struct vhost_log *log, unsigned int *log_num); -void vhost_discard_vq_desc(struct vhost_virtqueue *, int n); + +int vhost_get_vq_desc_n(struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num, + struct vhost_log *log, unsigned int *log_num, + unsigned int *ndesc); + +void vhost_discard_vq_desc(struct vhost_virtqueue *, int nbuf, + unsigned int ndesc); bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work); bool vhost_vq_has_work(struct vhost_virtqueue *vq); diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 9bd3c3814b5c..e7e07eb2142e 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -66,6 +66,7 @@ #include <linux/string.h> #include <linux/kd.h> #include <linux/panic.h> +#include <linux/pci.h> #include <linux/printk.h> #include <linux/slab.h> #include <linux/fb.h> @@ -78,6 +79,7 @@ #include <linux/interrupt.h> #include <linux/crc32.h> /* For counting font checksums */ #include <linux/uaccess.h> +#include <linux/vga_switcheroo.h> #include <asm/irq.h> #include "fbcon.h" @@ -2899,6 +2901,9 @@ void fbcon_fb_unregistered(struct fb_info *info) console_lock(); + if (info->device && dev_is_pci(info->device)) + vga_switcheroo_client_fb_set(to_pci_dev(info->device), NULL); + fbcon_registered_fb[info->node] = NULL; fbcon_num_registered_fb--; @@ -3032,6 +3037,10 @@ static int do_fb_registered(struct fb_info *info) } } + /* Set the fb info for vga_switcheroo clients. Does nothing otherwise. */ + if (info->device && dev_is_pci(info->device)) + vga_switcheroo_client_fb_set(to_pci_dev(info->device), info); + return ret; } diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 528682bf0c7f..f794014814be 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -410,7 +410,7 @@ static char *join(const char *dir, const char *name) return (!buffer) ? ERR_PTR(-ENOMEM) : buffer; } -static char **split(char *strings, unsigned int len, unsigned int *num) +static char **split_strings(char *strings, unsigned int len, unsigned int *num) { char *p, **ret; @@ -448,7 +448,7 @@ char **xenbus_directory(struct xenbus_transaction t, if (IS_ERR(strings)) return ERR_CAST(strings); - return split(strings, len, num); + return split_strings(strings, len, num); } EXPORT_SYMBOL_GPL(xenbus_directory); diff --git a/fs/9p/acl.c b/fs/9p/acl.c index eed551d8555f..633da5e37299 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/fs_struct.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <linux/slab.h> diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index eb0b083da269..612a230bc012 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -483,24 +483,15 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) static void v9fs_mmap_vm_close(struct vm_area_struct *vma) { - struct inode *inode; - - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_ALL, - .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE, - /* absolute end, byte at end included */ - .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE + - (vma->vm_end - vma->vm_start - 1), - }; - if (!(vma->vm_flags & VM_SHARED)) return; p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); - inode = file_inode(vma->vm_file); - filemap_fdatawrite_wbc(inode->i_mapping, &wbc); + filemap_fdatawrite_range(file_inode(vma->vm_file)->i_mapping, + (loff_t)vma->vm_pgoff * PAGE_SIZE, + (loff_t)vma->vm_pgoff * PAGE_SIZE + + (vma->vm_end - vma->vm_start - 1)); } static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index d0c77ec31b1d..8666c9c62258 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -422,7 +422,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; /* * initialize the inode with the stat info diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index be297e335468..1661a25f2772 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -112,7 +112,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; /* * initialize the inode with the stat info diff --git a/fs/Makefile b/fs/Makefile index e3523ab2e587..a04274a3c854 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ + fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ file_attr.o diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 0210df8d3500..0bfc7d151dcd 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -29,7 +29,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; pr_debug("affs_iget(%lu)\n", inode->i_ino); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index f31359922e98..71c10a05cebe 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -140,7 +140,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, return ERR_PTR(-ENOMEM); } - cell->name = kmalloc(1 + namelen + 1, GFP_KERNEL); + /* Allocate the cell name and the key name in one go. */ + cell->name = kmalloc(1 + namelen + 1 + + 4 + namelen + 1, GFP_KERNEL); if (!cell->name) { kfree(cell); return ERR_PTR(-ENOMEM); @@ -151,7 +153,11 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, cell->name_len = namelen; for (i = 0; i < namelen; i++) cell->name[i] = tolower(name[i]); - cell->name[i] = 0; + cell->name[i++] = 0; + + cell->key_desc = cell->name + i; + memcpy(cell->key_desc, "afs@", 4); + memcpy(cell->key_desc + 4, cell->name, cell->name_len + 1); cell->net = net; refcount_set(&cell->ref, 1); @@ -229,7 +235,7 @@ error: * @name: The name of the cell. * @namesz: The strlen of the cell name. * @vllist: A colon/comma separated list of numeric IP addresses or NULL. - * @excl: T if an error should be given if the cell name already exists. + * @reason: The reason we're doing the lookup * @trace: The reason to be logged if the lookup is successful. * * Look up a cell record by name and query the DNS for VL server addresses if @@ -239,7 +245,8 @@ error: */ struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl, + const char *vllist, + enum afs_lookup_cell_for reason, enum afs_cell_trace trace) { struct afs_cell *cell, *candidate, *cursor; @@ -247,12 +254,18 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, enum afs_cell_state state; int ret, n; - _enter("%s,%s", name, vllist); + _enter("%s,%s,%u", name, vllist, reason); - if (!excl) { + if (reason != AFS_LOOKUP_CELL_PRELOAD) { cell = afs_find_cell(net, name, namesz, trace); - if (!IS_ERR(cell)) + if (!IS_ERR(cell)) { + if (reason == AFS_LOOKUP_CELL_DYNROOT) + goto no_wait; + if (cell->state == AFS_CELL_SETTING_UP || + cell->state == AFS_CELL_UNLOOKED) + goto lookup_cell; goto wait_for_cell; + } } /* Assume we're probably going to create a cell and preallocate and @@ -298,26 +311,69 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, rb_insert_color(&cell->net_node, &net->cells); up_write(&net->cells_lock); - afs_queue_cell(cell, afs_cell_trace_queue_new); +lookup_cell: + if (reason != AFS_LOOKUP_CELL_PRELOAD && + reason != AFS_LOOKUP_CELL_ROOTCELL) { + set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); + afs_queue_cell(cell, afs_cell_trace_queue_new); + } wait_for_cell: - _debug("wait_for_cell"); state = smp_load_acquire(&cell->state); /* vs error */ - if (state != AFS_CELL_ACTIVE && - state != AFS_CELL_DEAD) { + switch (state) { + case AFS_CELL_ACTIVE: + case AFS_CELL_DEAD: + break; + case AFS_CELL_UNLOOKED: + default: + if (reason == AFS_LOOKUP_CELL_PRELOAD || + reason == AFS_LOOKUP_CELL_ROOTCELL) + break; + _debug("wait_for_cell"); afs_see_cell(cell, afs_cell_trace_wait); wait_var_event(&cell->state, ({ state = smp_load_acquire(&cell->state); /* vs error */ state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD; })); + _debug("waited_for_cell %d %d", cell->state, cell->error); } +no_wait: /* Check the state obtained from the wait check. */ + state = smp_load_acquire(&cell->state); /* vs error */ if (state == AFS_CELL_DEAD) { ret = cell->error; goto error; } + if (state == AFS_CELL_ACTIVE) { + switch (cell->dns_status) { + case DNS_LOOKUP_NOT_DONE: + if (cell->dns_source == DNS_RECORD_FROM_CONFIG) { + ret = 0; + break; + } + fallthrough; + default: + ret = -EIO; + goto error; + case DNS_LOOKUP_GOOD: + case DNS_LOOKUP_GOOD_WITH_BAD: + ret = 0; + break; + case DNS_LOOKUP_GOT_NOT_FOUND: + ret = -ENOENT; + goto error; + case DNS_LOOKUP_BAD: + ret = -EREMOTEIO; + goto error; + case DNS_LOOKUP_GOT_LOCAL_FAILURE: + case DNS_LOOKUP_GOT_TEMP_FAILURE: + case DNS_LOOKUP_GOT_NS_FAILURE: + ret = -EDESTADDRREQ; + goto error; + } + } _leave(" = %p [cell]", cell); return cell; @@ -325,7 +381,7 @@ wait_for_cell: cell_already_exists: _debug("cell exists"); cell = cursor; - if (excl) { + if (reason == AFS_LOOKUP_CELL_PRELOAD) { ret = -EEXIST; } else { afs_use_cell(cursor, trace); @@ -384,7 +440,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell) return -EINVAL; /* allocate a cell record for the root/workstation cell */ - new_root = afs_lookup_cell(net, rootcell, len, vllist, false, + new_root = afs_lookup_cell(net, rootcell, len, vllist, + AFS_LOOKUP_CELL_ROOTCELL, afs_cell_trace_use_lookup_ws); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); @@ -660,33 +717,6 @@ void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs) } /* - * Allocate a key to use as a placeholder for anonymous user security. - */ -static int afs_alloc_anon_key(struct afs_cell *cell) -{ - struct key *key; - char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp; - - /* Create a key to represent an anonymous user. */ - memcpy(keyname, "afs@", 4); - dp = keyname + 4; - cp = cell->name; - do { - *dp++ = tolower(*cp); - } while (*cp++); - - key = rxrpc_get_null_key(keyname); - if (IS_ERR(key)) - return PTR_ERR(key); - - cell->anonymous_key = key; - - _debug("anon key %p{%x}", - cell->anonymous_key, key_serial(cell->anonymous_key)); - return 0; -} - -/* * Activate a cell. */ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) @@ -695,12 +725,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) struct afs_cell *pcell; int ret; - if (!cell->anonymous_key) { - ret = afs_alloc_anon_key(cell); - if (ret < 0) - return ret; - } - ret = afs_proc_cell_setup(cell); if (ret < 0) return ret; @@ -777,6 +801,7 @@ static bool afs_manage_cell(struct afs_cell *cell) switch (cell->state) { case AFS_CELL_SETTING_UP: goto set_up_cell; + case AFS_CELL_UNLOOKED: case AFS_CELL_ACTIVE: goto cell_is_active; case AFS_CELL_REMOVING: @@ -797,7 +822,7 @@ set_up_cell: goto remove_cell; } - afs_set_cell_state(cell, AFS_CELL_ACTIVE); + afs_set_cell_state(cell, AFS_CELL_UNLOOKED); cell_is_active: if (afs_has_cell_expired(cell, &next_manage)) @@ -807,6 +832,8 @@ cell_is_active: ret = afs_update_cell(cell); if (ret < 0) cell->error = ret; + if (cell->state == AFS_CELL_UNLOOKED) + afs_set_cell_state(cell, AFS_CELL_ACTIVE); } if (next_manage < TIME64_MAX && cell->net->live) { diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 89d36e3e5c79..f4e9e12373ac 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -779,7 +779,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode; struct inode *inode = NULL, *ti; afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version); - bool supports_ibulk; + bool supports_ibulk, isnew; long ret; int i; @@ -850,7 +850,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) * callback counters. */ ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode, - afs_ilookup5_test_by_fid, &vp->fid); + afs_ilookup5_test_by_fid, &vp->fid, &isnew); if (!IS_ERR_OR_NULL(ti)) { vnode = AFS_FS_I(ti); vp->dv_before = vnode->status.data_version; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 8c6130789fde..aa56e8951e03 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -64,7 +64,7 @@ static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino) vnode = AFS_FS_I(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 2); @@ -108,7 +108,8 @@ static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry * dotted = true; } - cell = afs_lookup_cell(net, name, len, NULL, false, + cell = afs_lookup_cell(net, name, len, NULL, + AFS_LOOKUP_CELL_DYNROOT, afs_cell_trace_use_lookup_dynroot); if (IS_ERR(cell)) { ret = PTR_ERR(cell); @@ -258,7 +259,7 @@ static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry vnode = AFS_FS_I(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 1); @@ -383,7 +384,7 @@ struct inode *afs_dynroot_iget_root(struct super_block *sb) vnode = AFS_FS_I(inode); /* there shouldn't be an existing inode */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 2); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e1cb17b85791..dde1857fcabb 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -427,7 +427,7 @@ static void afs_fetch_status_success(struct afs_operation *op) struct afs_vnode *vnode = vp->vnode; int ret; - if (vnode->netfs.inode.i_state & I_NEW) { + if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) { ret = afs_inode_init_from_status(op, vp, vnode); afs_op_set_error(op, ret); if (ret == 0) @@ -579,7 +579,7 @@ struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp) inode, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); /* deal with an existing inode */ - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { _leave(" = %p", inode); return inode; } @@ -639,7 +639,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) _debug("GOT ROOT INODE %p { vl=%llx }", inode, as->volume->vid); - BUG_ON(!(inode->i_state & I_NEW)); + BUG_ON(!(inode_state_read_once(inode) & I_NEW)); vnode = AFS_FS_I(inode); vnode->cb_v_check = atomic_read(&as->volume->cb_v_break); @@ -748,7 +748,7 @@ void afs_evict_inode(struct inode *inode) if ((S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - (inode->i_state & I_DIRTY) && + (inode_state_read_once(inode) & I_DIRTY) && !sbi->dyn_root) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a45ae5c2ef8a..009064b8d661 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -343,6 +343,7 @@ extern const char afs_init_sysname[]; enum afs_cell_state { AFS_CELL_SETTING_UP, + AFS_CELL_UNLOOKED, AFS_CELL_ACTIVE, AFS_CELL_REMOVING, AFS_CELL_DEAD, @@ -412,6 +413,7 @@ struct afs_cell { u8 name_len; /* Length of name */ char *name; /* Cell name, case-flattened and NUL-padded */ + char *key_desc; /* Authentication key description */ }; /* @@ -1049,9 +1051,18 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); +enum afs_lookup_cell_for { + AFS_LOOKUP_CELL_DYNROOT, + AFS_LOOKUP_CELL_MOUNTPOINT, + AFS_LOOKUP_CELL_DIRECT_MOUNT, + AFS_LOOKUP_CELL_PRELOAD, + AFS_LOOKUP_CELL_ROOTCELL, + AFS_LOOKUP_CELL_ALIAS_CHECK, +}; struct afs_cell *afs_lookup_cell(struct afs_net *net, const char *name, unsigned int namesz, - const char *vllist, bool excl, + const char *vllist, + enum afs_lookup_cell_for reason, enum afs_cell_trace trace); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 1ad048e6e164..57c204a3c04e 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size > AFS_MAXCELLNAME) return -ENAMETOOLONG; - cell = afs_lookup_cell(ctx->net, p, size, NULL, false, + cell = afs_lookup_cell(ctx->net, p, size, NULL, + AFS_LOOKUP_CELL_MOUNTPOINT, afs_cell_trace_use_lookup_mntpt); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 40e879c8ca77..44520549b509 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -122,7 +122,8 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) if (strcmp(buf, "add") == 0) { struct afs_cell *cell; - cell = afs_lookup_cell(net, name, strlen(name), args, true, + cell = afs_lookup_cell(net, name, strlen(name), args, + AFS_LOOKUP_CELL_PRELOAD, afs_cell_trace_use_lookup_add); if (IS_ERR(cell)) { ret = PTR_ERR(cell); diff --git a/fs/afs/security.c b/fs/afs/security.c index 6a7744c9e2a2..55ddce94af03 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -16,6 +16,31 @@ static DEFINE_HASHTABLE(afs_permits_cache, 10); static DEFINE_SPINLOCK(afs_permits_lock); +static DEFINE_MUTEX(afs_key_lock); + +/* + * Allocate a key to use as a placeholder for anonymous user security. + */ +static int afs_alloc_anon_key(struct afs_cell *cell) +{ + struct key *key; + + mutex_lock(&afs_key_lock); + key = cell->anonymous_key; + if (!key) { + key = rxrpc_get_null_key(cell->key_desc); + if (!IS_ERR(key)) + cell->anonymous_key = key; + } + mutex_unlock(&afs_key_lock); + + if (IS_ERR(key)) + return PTR_ERR(key); + + _debug("anon key %p{%x}", + cell->anonymous_key, key_serial(cell->anonymous_key)); + return 0; +} /* * get a key @@ -23,11 +48,12 @@ static DEFINE_SPINLOCK(afs_permits_lock); struct key *afs_request_key(struct afs_cell *cell) { struct key *key; + int ret; - _enter("{%x}", key_serial(cell->anonymous_key)); + _enter("{%s}", cell->key_desc); - _debug("key %s", cell->anonymous_key->description); - key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description, + _debug("key %s", cell->key_desc); + key = request_key_net(&key_type_rxrpc, cell->key_desc, cell->net->net, NULL); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { @@ -35,6 +61,12 @@ struct key *afs_request_key(struct afs_cell *cell) return key; } + if (!cell->anonymous_key) { + ret = afs_alloc_anon_key(cell); + if (ret < 0) + return ERR_PTR(ret); + } + /* act as anonymous user */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); @@ -52,11 +84,10 @@ struct key *afs_request_key_rcu(struct afs_cell *cell) { struct key *key; - _enter("{%x}", key_serial(cell->anonymous_key)); + _enter("{%s}", cell->key_desc); - _debug("key %s", cell->anonymous_key->description); - key = request_key_net_rcu(&key_type_rxrpc, - cell->anonymous_key->description, + _debug("key %s", cell->key_desc); + key = request_key_net_rcu(&key_type_rxrpc, cell->key_desc, cell->net->net); if (IS_ERR(key)) { if (PTR_ERR(key) != -ENOKEY) { @@ -65,6 +96,8 @@ struct key *afs_request_key_rcu(struct afs_cell *cell) } /* act as anonymous user */ + if (!cell->anonymous_key) + return NULL; /* Need to allocate */ _leave(" = {%x} [anon]", key_serial(cell->anonymous_key)); return key_get(cell->anonymous_key); } else { @@ -408,7 +441,7 @@ int afs_permission(struct mnt_idmap *idmap, struct inode *inode, if (mask & MAY_NOT_BLOCK) { key = afs_request_key_rcu(vnode->volume->cell); - if (IS_ERR(key)) + if (IS_ERR_OR_NULL(key)) return -ECHILD; ret = -ECHILD; diff --git a/fs/afs/super.c b/fs/afs/super.c index da407f2d6f0d..d672b7ab57ae 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -290,7 +290,7 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) /* lookup the cell record */ if (cellname) { cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, - NULL, false, + NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT, afs_cell_trace_use_lookup_mount); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%*.*s'\n", diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index 709b4cdb723e..fc9676abd252 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) if (!name_len || name_len > AFS_MAXCELLNAME) master = ERR_PTR(-EOPNOTSUPP); else - master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false, + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, + AFS_LOOKUP_CELL_ALIAS_CHECK, afs_cell_trace_use_lookup_canonical); kfree(cell_name); if (IS_ERR(master)) @@ -1640,10 +1640,10 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - const struct cred *old_cred = override_creds(iocb->fsync.creds); - iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); - revert_creds(old_cred); + scoped_with_creds(iocb->fsync.creds) + iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + put_cred(iocb->fsync.creds); iocb_put(iocb); } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 180a458fc4f7..b8381c7fb636 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -280,27 +280,8 @@ static int __anon_inode_getfd(const char *name, const struct inode *context_inode, bool make_inode) { - int error, fd; - struct file *file; - - error = get_unused_fd_flags(flags); - if (error < 0) - return error; - fd = error; - - file = __anon_inode_getfile(name, fops, priv, flags, context_inode, - make_inode); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_put_unused_fd; - } - fd_install(fd, file); - - return fd; - -err_put_unused_fd: - put_unused_fd(fd); - return error; + return FD_ADD(flags, __anon_inode_getfile(name, fops, priv, flags, + context_inode, make_inode)); } /** diff --git a/fs/attr.c b/fs/attr.c index 795f231d00e8..b9ec6b47bab2 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -415,7 +415,7 @@ EXPORT_SYMBOL(may_setattr); * performed on the raw inode simply pass @nop_mnt_idmap. */ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr, struct inode **delegated_inode) + struct iattr *attr, struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 23cea74f9933..4fd555528c5d 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -16,6 +16,7 @@ #include <linux/wait.h> #include <linux/sched.h> #include <linux/sched/signal.h> +#include <uapi/linux/mount.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/uaccess.h> @@ -27,6 +28,9 @@ #include <linux/magic.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> +#include "../mount.h" +#include <linux/ns_common.h> + /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -114,6 +118,7 @@ struct autofs_sb_info { int pipefd; struct file *pipe; struct pid *oz_pgrp; + u64 mnt_ns_id; int version; int sub_version; int min_proto; diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index d8dd150cbd74..a58f9248b0f5 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -231,32 +231,14 @@ static int test_by_type(const struct path *path, void *p) */ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) { - int err, fd; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (likely(fd >= 0)) { - struct file *filp; - struct path path; - - err = find_autofs_mount(name, &path, test_by_dev, &devid); - if (err) - goto out; - - filp = dentry_open(&path, O_RDONLY, current_cred()); - path_put(&path); - if (IS_ERR(filp)) { - err = PTR_ERR(filp); - goto out; - } - - fd_install(fd, filp); - } + struct path path __free(path_put) = {}; + int err; - return fd; + err = find_autofs_mount(name, &path, test_by_dev, &devid); + if (err) + return err; -out: - put_unused_fd(fd); - return err; + return FD_ADD(O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred())); } /* Open a file descriptor on an autofs mount point */ @@ -381,6 +363,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; + sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id; sbi->flags &= ~AUTOFS_SBI_CATATONIC; } out: diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index f5c16ffba013..732aee76a24c 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -251,6 +251,7 @@ static struct autofs_sb_info *autofs_alloc_sbi(void) sbi->min_proto = AUTOFS_MIN_PROTO_VERSION; sbi->max_proto = AUTOFS_MAX_PROTO_VERSION; sbi->pipefd = -1; + sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id; set_autofs_type_indirect(&sbi->type); mutex_init(&sbi->wq_mutex); diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 174c7205fee4..d10df9d89d1c 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -341,6 +341,14 @@ static struct vfsmount *autofs_d_automount(struct path *path) if (autofs_oz_mode(sbi)) return NULL; + /* Refuse to trigger mount if current namespace is not the owner + * and the mount is propagation private. + */ + if (sbi->mnt_ns_id != to_ns_common(current->nsproxy->mnt_ns)->ns_id) { + if (vfsmount_to_propagation_flags(path->mnt) & MS_PRIVATE) + return ERR_PTR(-EPERM); + } + /* * If an expire request is pending everyone must wait. * If the expire fails we're still mounted so continue diff --git a/fs/backing-file.c b/fs/backing-file.c index 15a7f8031084..45da8600d564 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -157,13 +157,37 @@ static int backing_aio_init_wq(struct kiocb *iocb) return sb_init_dio_done_wq(sb); } +static int do_backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags) +{ + struct backing_aio *aio = NULL; + int ret; + + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + return vfs_iter_read(file, iter, &iocb->ki_pos, rwf); + } + + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + return -ENOMEM; + + aio->orig_iocb = iocb; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_complete = backing_aio_rw_complete; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_read(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + return ret; +} ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx) { - struct backing_aio *aio = NULL; - const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) @@ -176,41 +200,57 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; - old_cred = override_creds(ctx->cred); + scoped_with_creds(ctx->cred) + ret = do_backing_file_read_iter(file, iter, iocb, flags); + + if (ctx->accessed) + ctx->accessed(iocb->ki_filp); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_read_iter); + +static int do_backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + void (*end_write)(struct kiocb *, ssize_t)) +{ + struct backing_aio *aio; + int ret; + if (is_sync_kiocb(iocb)) { rwf_t rwf = iocb_to_rw_flags(flags); - ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); - } else { - ret = -ENOMEM; - aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); - if (!aio) - goto out; - - aio->orig_iocb = iocb; - kiocb_clone(&aio->iocb, iocb, get_file(file)); - aio->iocb.ki_complete = backing_aio_rw_complete; - refcount_set(&aio->ref, 2); - ret = vfs_iocb_iter_read(file, &aio->iocb, iter); - backing_aio_put(aio); - if (ret != -EIOCBQUEUED) - backing_aio_cleanup(aio, ret); + ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); + if (end_write) + end_write(iocb, ret); + return ret; } -out: - revert_creds(old_cred); - if (ctx->accessed) - ctx->accessed(iocb->ki_filp); + ret = backing_aio_init_wq(iocb); + if (ret) + return ret; + + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + return -ENOMEM; + aio->orig_iocb = iocb; + aio->end_write = end_write; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_flags = flags; + aio->iocb.ki_complete = backing_aio_queue_completion; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_write(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); return ret; } -EXPORT_SYMBOL_GPL(backing_file_read_iter); ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx) { - const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) @@ -227,46 +267,8 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; - /* - * Stacked filesystems don't support deferred completions, don't copy - * this property in case it is set by the issuer. - */ - flags &= ~IOCB_DIO_CALLER_COMP; - - old_cred = override_creds(ctx->cred); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(flags); - - ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); - if (ctx->end_write) - ctx->end_write(iocb, ret); - } else { - struct backing_aio *aio; - - ret = backing_aio_init_wq(iocb); - if (ret) - goto out; - - ret = -ENOMEM; - aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); - if (!aio) - goto out; - - aio->orig_iocb = iocb; - aio->end_write = ctx->end_write; - kiocb_clone(&aio->iocb, iocb, get_file(file)); - aio->iocb.ki_flags = flags; - aio->iocb.ki_complete = backing_aio_queue_completion; - refcount_set(&aio->ref, 2); - ret = vfs_iocb_iter_write(file, &aio->iocb, iter); - backing_aio_put(aio); - if (ret != -EIOCBQUEUED) - backing_aio_cleanup(aio, ret); - } -out: - revert_creds(old_cred); - - return ret; + scoped_with_creds(ctx->cred) + return do_backing_file_write_iter(file, iter, iocb, flags, ctx->end_write); } EXPORT_SYMBOL_GPL(backing_file_write_iter); @@ -275,15 +277,13 @@ ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb, unsigned int flags, struct backing_file_ctx *ctx) { - const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) return -EIO; - old_cred = override_creds(ctx->cred); - ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags); - revert_creds(old_cred); + scoped_with_creds(ctx->cred) + ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags); if (ctx->accessed) ctx->accessed(iocb->ki_filp); @@ -297,7 +297,6 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, size_t len, unsigned int flags, struct backing_file_ctx *ctx) { - const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) @@ -310,11 +309,11 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, if (ret) return ret; - old_cred = override_creds(ctx->cred); - file_start_write(out); - ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags); - file_end_write(out); - revert_creds(old_cred); + scoped_with_creds(ctx->cred) { + file_start_write(out); + ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags); + file_end_write(out); + } if (ctx->end_write) ctx->end_write(iocb, ret); @@ -326,7 +325,6 @@ EXPORT_SYMBOL_GPL(backing_file_splice_write); int backing_file_mmap(struct file *file, struct vm_area_struct *vma, struct backing_file_ctx *ctx) { - const struct cred *old_cred; struct file *user_file = vma->vm_file; int ret; @@ -338,9 +336,8 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma, vma_set_file(vma, file); - old_cred = override_creds(ctx->cred); - ret = vfs_mmap(vma->vm_file, vma); - revert_creds(old_cred); + scoped_with_creds(ctx->cred) + ret = vfs_mmap(vma->vm_file, vma); if (ctx->accessed) ctx->accessed(user_file); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 8f430ff8e445..9fcfdd6b8189 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -307,7 +307,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; befs_ino = BEFS_I(inode); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 1d41ce477df5..ce6f83234b67 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -42,7 +42,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { @@ -61,7 +61,19 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; di = (struct bfs_inode *)bh->b_data + off; - inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); + /* + * https://martin.hinner.info/fs/bfs/bfs-structure.html explains that + * BFS in SCO UnixWare environment used only lower 9 bits of di->i_mode + * value. This means that, although bfs_write_inode() saves whole + * inode->i_mode bits (which include S_IFMT bits and S_IS{UID,GID,VTX} + * bits), middle 7 bits of di->i_mode value can be garbage when these + * bits were not saved by bfs_write_inode(). + * Since we can't tell whether middle 7 bits are garbage, use only + * lower 12 bits (i.e. tolerate S_IS{UID,GID,VTX} bits possibly being + * garbage) and reconstruct S_IFMT bits for Linux environment from + * di->i_vtype value. + */ + inode->i_mode = 0x00000FFF & le32_to_cpu(di->i_mode); if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { inode->i_mode |= S_IFDIR; inode->i_op = &bfs_dir_inops; @@ -71,6 +83,11 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; + } else { + brelse(bh); + printf("Unknown vtype=%u %s:%08lx\n", + le32_to_cpu(di->i_vtype), inode->i_sb->s_id, ino); + goto error; } BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e4653bb99946..3eb734c192e9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -46,7 +46,7 @@ #include <linux/cred.h> #include <linux/dax.h> #include <linux/uaccess.h> -#include <linux/rseq.h> +#include <uapi/linux/rseq.h> #include <asm/param.h> #include <asm/page.h> diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a839f960cd4a..d7aec5b87c2b 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -782,8 +782,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); if (e->flags & MISC_FMT_OPEN_FILE) { - const struct cred *old_cred; - /* * Now that we support unprivileged binfmt_misc mounts make * sure we use the credentials that the register @file was @@ -791,9 +789,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, * didn't matter much as only a privileged process could open * the register file. */ - old_cred = override_creds(file->f_cred); - f = open_exec(e->interpreter); - revert_creds(old_cred); + scoped_with_creds(file->f_cred) + f = open_exec(e->interpreter); if (IS_ERR(f)) { pr_notice("register: failed to install interpreter file %s\n", e->interpreter); @@ -837,8 +834,10 @@ out: inode_unlock(d_inode(root)); if (err) { - if (f) + if (f) { + exe_file_allow_write_access(f); filp_close(f, NULL); + } kfree(e); return err; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5322ef2ae015..08cdda47509f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1850,12 +1850,10 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!btrfs_should_reclaim(fs_info)) return; - sb_start_write(fs_info->sb); + guard(super_write)(fs_info->sb); - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { - sb_end_write(fs_info->sb); + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) return; - } /* * Long running balances can keep us blocked here for eternity, so @@ -1863,7 +1861,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { btrfs_exclop_finish(fs_info); - sb_end_write(fs_info->sb); return; } @@ -1947,7 +1944,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) /* * Get out fast, in case we're read-only or unmounting the * filesystem. It is OK to drop block groups from the list even - * for the read-only case. As we did sb_start_write(), + * for the read-only case. As we did take the super write lock, * "mount -o remount,ro" won't happen and read-only filesystem * means it is forced read-only due to a fatal error. So, it * never gets back to read-write to let us reclaim again. @@ -2030,7 +2027,6 @@ end: list_splice_tail(&retry_list, &fs_info->reclaim_bgs); spin_unlock(&fs_info->unused_bgs_lock); btrfs_exclop_finish(fs_info); - sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index eba188a9e3bb..aee1fd21cdd6 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -85,8 +85,8 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u6 { /* @cur must be inside the folio. */ ASSERT(folio_pos(folio) <= cur); - ASSERT(cur < folio_end(folio)); - return min(range_end, folio_end(folio)) - cur; + ASSERT(cur < folio_next_pos(folio)); + return umin(range_end, folio_next_pos(folio)) - cur; } int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 7b277934f66f..a7f20f048398 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -254,10 +254,9 @@ again: range.extent_thresh = defrag->extent_thresh; file_ra_state_init(ra, inode->vfs_inode.i_mapping); - sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); - sb_end_write(fs_info->sb); + scoped_guard(super_write, fs_info->sb) + ret = btrfs_defrag_file(inode, ra, &range, + defrag->transid, BTRFS_DEFRAG_BATCH); iput(&inode->vfs_inode); if (ret < 0) @@ -886,7 +885,7 @@ again: } lock_start = folio_pos(folio); - lock_end = folio_end(folio) - 1; + lock_end = folio_next_pos(folio) - 1; /* Wait for any existing ordered extent in the range */ while (1) { struct btrfs_ordered_extent *ordered; @@ -1178,7 +1177,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, if (!folio) break; - if (start >= folio_end(folio) || start + len <= folio_pos(folio)) + if (start >= folio_next_pos(folio) || + start + len <= folio_pos(folio)) continue; btrfs_folio_clamp_clear_checked(fs_info, folio, start, len); btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); @@ -1219,7 +1219,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, folios[i] = NULL; goto free_folios; } - cur = folio_end(folios[i]); + cur = folio_next_pos(folios[i]); } for (int i = 0; i < nr_pages; i++) { if (!folios[i]) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 755ec6dfd51c..7361d5d890d2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -333,7 +333,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, goto out; } range_start = max_t(u64, folio_pos(folio), start); - range_len = min_t(u64, folio_end(folio), end + 1) - range_start; + range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start; btrfs_folio_set_lock(fs_info, folio, range_start, range_len); processed_end = range_start + range_len - 1; @@ -387,7 +387,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, ASSERT(orig_end > orig_start); /* The range should at least cover part of the folio */ - ASSERT(!(orig_start >= folio_end(locked_folio) || + ASSERT(!(orig_start >= folio_next_pos(locked_folio) || orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ @@ -493,7 +493,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_end(folio)); + start + len <= folio_next_pos(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); @@ -1201,7 +1201,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode, * finished our folio read and unlocked the folio. */ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { - u64 range_len = min(folio_end(folio), + u64 range_len = umin(folio_next_pos(folio), ordered->file_offset + ordered->num_bytes) - cur; ret = true; @@ -1223,7 +1223,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode, * So we return true and update @next_ret to the OE/folio boundary. */ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { - u64 range_len = min(folio_end(folio), + u64 range_len = umin(folio_next_pos(folio), ordered->file_offset + ordered->num_bytes) - cur; /* @@ -2215,7 +2215,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); - u32 range_len = min_t(u64, folio_end(folio), + u32 range_len = min_t(u64, folio_next_pos(folio), eb->start + eb->len) - range_start; folio_lock(folio); @@ -2228,6 +2228,14 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_account_cgroup_owner(wbc, folio, range_len); folio_unlock(folio); } + /* + * If the fs is already in error status, do not submit any writeback + * but immediately finish it. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) { + btrfs_bio_end_io(bbio, errno_to_blk_status(BTRFS_FS_ERROR(fs_info))); + return; + } btrfs_submit_bbio(bbio, 0); } @@ -2460,10 +2468,7 @@ static int extent_write_cache_pages(struct address_space *mapping, &BTRFS_I(inode)->runtime_flags)) wbc->tagged_writepages = 1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); @@ -2619,7 +2624,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f continue; } - cur_end = min_t(u64, folio_end(folio) - 1, end); + cur_end = min_t(u64, folio_next_pos(folio) - 1, end); cur_len = cur_end + 1 - cur; ASSERT(folio_test_locked(folio)); @@ -3860,7 +3865,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); - u32 range_len = min_t(u64, folio_end(folio), + u32 range_len = min_t(u64, folio_next_pos(folio), eb->start + eb->len) - range_start; bio_add_folio_nofail(&bbio->bio, folio, range_len, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7efd1f8a1912..e7453f992e1e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -89,7 +89,8 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); - ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes); + ASSERT(folio_pos(folio) <= pos && + folio_next_pos(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; @@ -799,7 +800,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); - u64 clamp_end = min_t(u64, pos + len, folio_end(folio)); + u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio)); const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; @@ -1254,8 +1255,8 @@ again: * The reserved range goes beyond the current folio, shrink the reserved * space to the folio boundary. */ - if (reserved_start + reserved_len > folio_end(folio)) { - const u64 last_block = folio_end(folio); + if (reserved_start + reserved_len > folio_next_pos(folio)) { + const u64 last_block = folio_next_pos(folio); shrink_reserved_space(inode, *data_reserved, reserved_start, reserved_len, last_block - reserved_start, @@ -2854,12 +2855,22 @@ static int btrfs_fallocate_update_isize(struct inode *inode, { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; + u64 range_start; + u64 range_end; int ret; int ret2; if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) return 0; + range_start = round_down(i_size_read(inode), root->fs_info->sectorsize); + range_end = round_up(end, root->fs_info->sectorsize); + + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), range_start, + range_end - range_start); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b1b3a0553ee..9c6ca87b3d56 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9,6 +9,7 @@ #include <linux/blk-cgroup.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/fs_struct.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> @@ -177,8 +178,10 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, return ret; } ret = paths_from_inode(inum, ipath); - if (ret < 0) + if (ret < 0) { + btrfs_put_root(local_root); goto err; + } /* * We deliberately ignore the bit ipath might have been too small to @@ -409,7 +412,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, continue; } - index = folio_end(folio) >> PAGE_SHIFT; + index = folio_next_index(folio); /* * Here we just clear all Ordered bits for every page in the * range, then btrfs_mark_ordered_io_finished() will handle @@ -2336,7 +2339,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio))); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_next_pos(locked_folio))); if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); @@ -2743,7 +2747,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = folio_pos(folio); - u64 page_end = folio_end(folio) - 1; + u64 page_end = folio_next_pos(folio) - 1; int ret = 0; bool free_delalloc_space = true; @@ -3884,7 +3888,7 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) ASSERT(ret != -ENOMEM); return ret; } else if (existing) { - WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); + WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING))); } return 0; @@ -4855,7 +4859,7 @@ again: */ zero_start = max_t(u64, folio_pos(folio), start); - zero_end = folio_end(folio); + zero_end = folio_next_pos(folio); folio_zero_range(folio, zero_start - folio_pos(folio), zero_end - zero_start); @@ -5038,7 +5042,7 @@ again: * not reach disk, it still affects our page caches. */ zero_start = max_t(u64, folio_pos(folio), start); - zero_end = min_t(u64, folio_end(folio) - 1, end); + zero_end = min_t(u64, folio_next_pos(folio) - 1, end); } else { zero_start = max_t(u64, block_start, start); zero_end = min_t(u64, block_end, end); @@ -5361,7 +5365,7 @@ static void evict_inode_truncate_pages(struct inode *inode) struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct rb_node *node; - ASSERT(inode->i_state & I_FREEING); + ASSERT(inode_state_read_once(inode) & I_FREEING); truncate_inode_pages_final(&inode->i_data); btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); @@ -5799,7 +5803,7 @@ struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->vfs_inode.i_state & I_NEW)) + if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); @@ -5823,7 +5827,7 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->vfs_inode.i_state & I_NEW)) + if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW)) return inode; path = btrfs_alloc_path(); @@ -5837,6 +5841,8 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (ret) return ERR_PTR(ret); + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC; unlock_new_inode(&inode->vfs_inode); return inode; } @@ -6289,8 +6295,8 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) } /* - * This is a copy of file_update_time. We need this so we can return error on - * ENOSPC for updating the inode in the case of file write and mmap writes. + * We need our own ->update_time so that we can return error on ENOSPC for + * updating the inode in the case of file write and mmap writes. */ static int btrfs_update_time(struct inode *inode, int flags) { @@ -6788,8 +6794,11 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, } ret = btrfs_create_new_inode(trans, &new_inode_args); - if (!ret) + if (!ret) { + if (S_ISDIR(inode->i_mode)) + inode->i_opflags |= IOP_FASTPERM_MAY_EXEC; d_instantiate_new(dentry, inode); + } btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -6873,7 +6882,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inode_inc_iversion(inode); inode_set_ctime_current(inode); - set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); @@ -7480,7 +7488,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, u64 page_start = folio_pos(folio); u64 page_end = page_start + folio_size(folio) - 1; u64 cur; - int inode_evicting = inode->vfs_inode.i_state & I_FREEING; + int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING; /* * We have folio locked so no new ordered extent can be created on this @@ -8709,15 +8717,13 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, - struct writeback_control *wbc, bool snapshot, - bool in_reclaim_context) +static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write, + bool snapshot, bool in_reclaim_context) { struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); int ret = 0; - bool full_flush = wbc->nr_to_write == LONG_MAX; mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); @@ -8743,10 +8749,10 @@ static int start_delalloc_inodes(struct btrfs_root *root, if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); - if (full_flush) { - work = btrfs_alloc_delalloc_work(&inode->vfs_inode); + if (nr_to_write == NULL) { + work = btrfs_alloc_delalloc_work(tmp_inode); if (!work) { - iput(&inode->vfs_inode); + iput(tmp_inode); ret = -ENOMEM; goto out; } @@ -8754,9 +8760,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + ret = filemap_flush_nr(tmp_inode->i_mapping, + nr_to_write); btrfs_add_delayed_iput(inode); - if (ret || wbc->nr_to_write <= 0) + + if (ret || *nr_to_write <= 0) goto out; } cond_resched(); @@ -8782,29 +8790,17 @@ out: int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = LONG_MAX, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; struct btrfs_fs_info *fs_info = root->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - - return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); + return start_delalloc_inodes(root, NULL, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { - struct writeback_control wbc = { - .nr_to_write = nr, - .sync_mode = WB_SYNC_NONE, - .range_start = 0, - .range_end = LLONG_MAX, - }; + long *nr_to_write = nr == LONG_MAX ? NULL : &nr; struct btrfs_root *root; LIST_HEAD(splice); int ret; @@ -8816,13 +8812,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { - /* - * Reset nr_to_write here so we know that we're doing a full - * flush. - */ - if (nr == LONG_MAX) - wbc.nr_to_write = LONG_MAX; - root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_root(root); @@ -8831,9 +8820,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); + ret = start_delalloc_inodes(root, nr_to_write, false, + in_reclaim_context); btrfs_put_root(root); - if (ret < 0 || wbc.nr_to_write <= 0) + if (ret < 0 || nr <= 0) goto out; spin_lock(&fs_info->delalloc_root_lock); } @@ -9169,6 +9159,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } +/* + * NOTE: in case you are adding MAY_EXEC check for directories: + * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to + * elide calls here. + */ static int btrfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8cb7d5a462ef..b138120feba3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -904,14 +904,9 @@ static noinline int btrfs_mksubvol(struct dentry *parent, struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len); int ret; - ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (ret == -EINTR) - return ret; - - dentry = lookup_one(idmap, qname, parent); - ret = PTR_ERR(dentry); + dentry = start_creating_killable(idmap, parent, qname); if (IS_ERR(dentry)) - goto out_unlock; + return PTR_ERR(dentry); ret = btrfs_may_create(idmap, dir, dentry); if (ret) @@ -940,9 +935,7 @@ static noinline int btrfs_mksubvol(struct dentry *parent, out_up_read: up_read(&fs_info->subvol_sem); out_dput: - dput(dentry); -out_unlock: - btrfs_inode_unlock(BTRFS_I(dir), 0); + end_creating(dentry); return ret; } @@ -2417,18 +2410,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto free_subvol_name; } - ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (ret == -EINTR) - goto free_subvol_name; - dentry = lookup_one(idmap, &QSTR(subvol_name), parent); + dentry = start_removing_killable(idmap, parent, &QSTR(subvol_name)); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); - goto out_unlock_dir; - } - - if (d_really_is_negative(dentry)) { - ret = -ENOENT; - goto out_dput; + goto out_end_removing; } inode = d_inode(dentry); @@ -2449,7 +2434,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ ret = -EPERM; if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) - goto out_dput; + goto out_end_removing; /* * Do not allow deletion if the parent dir is the same @@ -2460,21 +2445,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ ret = -EINVAL; if (root == dest) - goto out_dput; + goto out_end_removing; ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); if (ret) - goto out_dput; + goto out_end_removing; } /* check if subvolume may be deleted by a user */ ret = btrfs_may_delete(idmap, dir, dentry, 1); if (ret) - goto out_dput; + goto out_end_removing; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; - goto out_dput; + goto out_end_removing; } btrfs_inode_lock(BTRFS_I(inode), 0); @@ -2483,10 +2468,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (!ret) d_delete_notify(dir, dentry); -out_dput: - dput(dentry); -out_unlock_dir: - btrfs_inode_unlock(BTRFS_I(dir), 0); +out_end_removing: + end_removing(dentry); free_subvol_name: kfree(subvol_name_ptr); free_parent: diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 60f9b000d644..17b71e1285e5 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -209,9 +209,4 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr, return (found_set == start + nbits); } -static inline u64 folio_end(struct folio *folio) -{ - return folio_pos(folio) + folio_size(folio); -} - #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2829f20d7bb5..7fedebbee558 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -359,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, if (folio) { ASSERT(folio->mapping); ASSERT(folio_pos(folio) <= file_offset); - ASSERT(file_offset + len <= folio_end(folio)); + ASSERT(file_offset + len <= folio_next_pos(folio)); /* * Ordered flag indicates whether we still have diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1175b8192cd7..31ad8580322a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1539,8 +1539,10 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst ASSERT(prealloc); /* Check the level of src and dst first */ - if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) + if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) { + kfree(prealloc); return -EINVAL; + } mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 651b11884f82..ba20d9286a34 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2203,6 +2203,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, &length, &bioc, NULL, NULL); if (ret < 0) { + bio_put(bio); btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); goto out; @@ -2212,6 +2213,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_put_bioc(bioc); if (!rbio) { ret = -ENOMEM; + bio_put(bio); btrfs_bio_counter_dec(fs_info); goto out; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 5ca8d4db6722..a7ba868e9372 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -186,7 +186,8 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, * unmapped page like dummy extent buffer pages. */ if (folio->mapping) - ASSERT(folio_pos(folio) <= start && start + len <= folio_end(folio), + ASSERT(folio_pos(folio) <= start && + start + len <= folio_next_pos(folio), "start=%llu len=%u folio_pos=%llu folio_size=%zu", start, len, folio_pos(folio), folio_size(folio)); } @@ -217,7 +218,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_end(folio), orig_start + orig_len) - *start; + *len = min_t(u64, folio_next_pos(folio), orig_start + orig_len) - *start; } static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 621e0df097e3..30f3c3b849c1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7122,7 +7122,7 @@ log_extents: * a power failure unless the log was synced as part of an fsync * against any other unrelated inode. */ - if (inode_only != LOG_INODE_EXISTS) + if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS) inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); @@ -7910,6 +7910,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, bool log_pinned = false; int ret; + /* The inode has a new name (ref/extref), so make sure we log it. */ + set_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); + btrfs_init_log_ctx(&ctx, inode); ctx.logging_new_name = true; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2bec544d8ba3..cc8aa4a04348 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2002,14 +2002,11 @@ out: static void update_dev_time(const char *device_path) { struct path path; - int ret; - - ret = kern_path(device_path, LOOKUP_FOLLOW, &path); - if (ret) - return; - inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); - path_put(&path); + if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) { + vfs_utimes(&path, NULL); + path_put(&path); + } } static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, @@ -4660,12 +4657,12 @@ static int balance_kthread(void *data) struct btrfs_fs_info *fs_info = data; int ret = 0; - sb_start_write(fs_info->sb); + guard(super_write)(fs_info->sb); + mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); mutex_unlock(&fs_info->balance_mutex); - sb_end_write(fs_info->sb); return ret; } @@ -8177,12 +8174,12 @@ static int relocating_repair_kthread(void *data) target = cache->start; btrfs_put_block_group(cache); - sb_start_write(fs_info->sb); + guard(super_write)(fs_info->sb); + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { btrfs_info(fs_info, "zoned: skip relocating block group %llu to repair: EBUSY", target); - sb_end_write(fs_info->sb); return -EBUSY; } @@ -8210,7 +8207,6 @@ out: btrfs_put_block_group(cache); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); - sb_end_write(fs_info->sb); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 0ea0df18a8e4..d1db7fa1fe58 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1317,6 +1317,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, if (!btrfs_dev_is_sequential(device, info->physical)) { up_read(&dev_replace->rwsem); info->alloc_offset = WP_CONVENTIONAL; + info->capacity = device->zone_info->zone_size; return 0; } @@ -1522,6 +1523,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; + u64 stripe_nr = 0, stripe_offset = 0; + u32 stripe_index = 0; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1529,28 +1532,26 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, return -EINVAL; } + if (last_alloc) { + u32 factor = map->num_stripes; + + stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK; + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + } + for (int i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - u64 stripe_nr, full_stripe_nr; - u64 stripe_offset; - int stripe_index; - stripe_nr = div64_u64(last_alloc, map->stripe_size); - stripe_offset = stripe_nr * map->stripe_size; - full_stripe_nr = div_u64(stripe_nr, map->num_stripes); - div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - - zone_info[i].alloc_offset = - full_stripe_nr * map->stripe_size; + zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); if (stripe_index > i) - zone_info[i].alloc_offset += map->stripe_size; + zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; else if (stripe_index == i) - zone_info[i].alloc_offset += - (last_alloc - stripe_offset); + zone_info[i].alloc_offset += stripe_offset; } if (test_bit(0, active) != test_bit(i, active)) { @@ -1574,6 +1575,8 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; + u64 stripe_nr = 0, stripe_offset = 0; + u32 stripe_index = 0; if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", @@ -1581,6 +1584,14 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, return -EINVAL; } + if (last_alloc) { + u32 factor = map->num_stripes / map->sub_stripes; + + stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT; + stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK; + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + } + for (int i = 0; i < map->num_stripes; i++) { if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; @@ -1594,26 +1605,12 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { - u64 stripe_nr, full_stripe_nr; - u64 stripe_offset; - int stripe_index; - - stripe_nr = div64_u64(last_alloc, map->stripe_size); - stripe_offset = stripe_nr * map->stripe_size; - full_stripe_nr = div_u64(stripe_nr, - map->num_stripes / map->sub_stripes); - div_u64_rem(stripe_nr, - (map->num_stripes / map->sub_stripes), - &stripe_index); - - zone_info[i].alloc_offset = - full_stripe_nr * map->stripe_size; + zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); if (stripe_index > (i / map->sub_stripes)) - zone_info[i].alloc_offset += map->stripe_size; + zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; else if (stripe_index == (i / map->sub_stripes)) - zone_info[i].alloc_offset += - (last_alloc - stripe_offset); + zone_info[i].alloc_offset += stripe_offset; } if ((i % map->sub_stripes) == 0) { @@ -1683,8 +1680,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags); if (num_conventional > 0) { - /* Zone capacity is always zone size in emulation */ - cache->zone_capacity = cache->length; ret = calculate_alloc_pointer(cache, &last_alloc, new); if (ret) { btrfs_err(fs_info, @@ -1693,6 +1688,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } else if (map->num_stripes == num_conventional) { cache->alloc_offset = last_alloc; + cache->zone_capacity = cache->length; set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); goto out; } diff --git a/fs/buffer.c b/fs/buffer.c index 6a8752f7bbed..838c0c571022 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -611,9 +611,9 @@ int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, return err; ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read_once(inode) & I_DIRTY_ALL)) goto out; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); @@ -2732,7 +2732,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, loff_t i_size = i_size_read(inode); /* Is the folio fully inside i_size? */ - if (folio_pos(folio) + folio_size(folio) <= i_size) + if (folio_next_pos(folio) <= i_size) return __block_write_full_folio(inode, folio, get_block, wbc); /* Is the folio fully outside i_size? (truncate in progress) */ diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 3e63cfe15874..a08250d244ea 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -9,6 +9,7 @@ #include <linux/mount.h> #include <linux/xattr.h> #include <linux/file.h> +#include <linux/namei.h> #include <linux/falloc.h> #include <trace/events/fscache.h> #include "internal.h" @@ -428,11 +429,13 @@ static bool cachefiles_invalidate_cookie(struct fscache_cookie *cookie) if (!old_tmpfile) { struct cachefiles_volume *volume = object->volume; struct dentry *fan = volume->fanout[(u8)cookie->key_hash]; + struct dentry *obj; - inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); - cachefiles_bury_object(volume->cache, object, fan, - old_file->f_path.dentry, - FSCACHE_OBJECT_INVALIDATED); + obj = start_removing_dentry(fan, old_file->f_path.dentry); + if (!IS_ERR(obj)) + cachefiles_bury_object(volume->cache, object, + fan, obj, + FSCACHE_OBJECT_INVALIDATED); } fput(old_file); } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d1edb2ac3837..e5ec90dccc27 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -93,12 +93,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); retry: ret = cachefiles_inject_read_error(); if (ret == 0) - subdir = lookup_one(&nop_mnt_idmap, &QSTR(dirname), dir); + subdir = start_creating(&nop_mnt_idmap, dir, &QSTR(dirname)); else subdir = ERR_PTR(ret); trace_cachefiles_lookup(NULL, dir, subdir); @@ -129,10 +128,12 @@ retry: if (ret < 0) goto mkdir_error; ret = cachefiles_inject_write_error(); - if (ret == 0) - subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); - else + if (ret == 0) { + subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700, NULL); + } else { + end_creating(subdir); subdir = ERR_PTR(ret); + } if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); @@ -141,7 +142,7 @@ retry: trace_cachefiles_mkdir(dir, subdir); if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) { - dput(subdir); + end_creating(subdir); goto retry; } ASSERT(d_backing_inode(subdir)); @@ -154,7 +155,7 @@ retry: /* Tell rmdir() it's not allowed to delete the subdir */ inode_lock(d_inode(subdir)); - inode_unlock(d_inode(dir)); + end_creating_keep(subdir); if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", @@ -196,14 +197,11 @@ mark_error: return ERR_PTR(-EBUSY); mkdir_error: - inode_unlock(d_inode(dir)); - if (!IS_ERR(subdir)) - dput(subdir); + end_creating(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - inode_unlock(d_inode(dir)); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); @@ -263,6 +261,8 @@ static int cachefiles_unlink(struct cachefiles_cache *cache, * - File backed objects are unlinked * - Directory backed objects are stuffed into the graveyard for userspace to * delete + * On entry dir must be locked. It will be unlocked on exit. + * On entry there must be at least 2 refs on rep, one will be dropped on exit. */ int cachefiles_bury_object(struct cachefiles_cache *cache, struct cachefiles_object *object, @@ -278,27 +278,23 @@ int cachefiles_bury_object(struct cachefiles_cache *cache, _enter(",'%pd','%pd'", dir, rep); if (rep->d_parent != dir) { - inode_unlock(d_inode(dir)); + end_removing(rep); _leave(" = -ESTALE"); return -ESTALE; } /* non-directories can just be unlinked */ if (!d_is_dir(rep)) { - dget(rep); /* Stop the dentry being negated if it's only pinned - * by a file struct. - */ ret = cachefiles_unlink(cache, object, dir, rep, why); - dput(rep); + end_removing(rep); - inode_unlock(d_inode(dir)); _leave(" = %d", ret); return ret; } /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - inode_unlock(d_inode(dir)); + end_removing(rep); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -425,13 +421,12 @@ int cachefiles_delete_object(struct cachefiles_object *object, _enter(",OBJ%x{%pD}", object->debug_id, object->file); - /* Stop the dentry being negated if it's only pinned by a file struct. */ - dget(dentry); - - inode_lock_nested(d_backing_inode(fan), I_MUTEX_PARENT); - ret = cachefiles_unlink(volume->cache, object, fan, dentry, why); - inode_unlock(d_backing_inode(fan)); - dput(dentry); + dentry = start_removing_dentry(fan, dentry); + if (IS_ERR(dentry)) + ret = PTR_ERR(dentry); + else + ret = cachefiles_unlink(volume->cache, object, fan, dentry, why); + end_removing(dentry); return ret; } @@ -644,9 +639,13 @@ bool cachefiles_look_up_object(struct cachefiles_object *object) if (!d_is_reg(dentry)) { pr_err("%pd is not a file\n", dentry); - inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); - ret = cachefiles_bury_object(volume->cache, object, fan, dentry, - FSCACHE_OBJECT_IS_WEIRD); + struct dentry *de = start_removing_dentry(fan, dentry); + if (IS_ERR(de)) + ret = PTR_ERR(de); + else + ret = cachefiles_bury_object(volume->cache, object, + fan, de, + FSCACHE_OBJECT_IS_WEIRD); dput(dentry); if (ret < 0) return false; @@ -679,36 +678,41 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, _enter(",%pD", object->file); - inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); ret = cachefiles_inject_read_error(); if (ret == 0) - dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan); + dentry = start_creating(&nop_mnt_idmap, fan, &QSTR(object->d_name)); else dentry = ERR_PTR(ret); if (IS_ERR(dentry)) { trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), cachefiles_trace_lookup_error); _debug("lookup fail %ld", PTR_ERR(dentry)); - goto out_unlock; + goto out; } - if (!d_is_negative(dentry)) { + /* + * This loop will only execute more than once if some other thread + * races to create the object we are trying to create. + */ + while (!d_is_negative(dentry)) { ret = cachefiles_unlink(volume->cache, object, fan, dentry, FSCACHE_OBJECT_IS_STALE); if (ret < 0) - goto out_dput; + goto out_end; + + end_creating(dentry); - dput(dentry); ret = cachefiles_inject_read_error(); if (ret == 0) - dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan); + dentry = start_creating(&nop_mnt_idmap, fan, + &QSTR(object->d_name)); else dentry = ERR_PTR(ret); if (IS_ERR(dentry)) { trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry), cachefiles_trace_lookup_error); _debug("lookup fail %ld", PTR_ERR(dentry)); - goto out_unlock; + goto out; } } @@ -729,10 +733,9 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, success = true; } -out_dput: - dput(dentry); -out_unlock: - inode_unlock(d_inode(fan)); +out_end: + end_creating(dentry); +out: _leave(" = %u", success); return success; } @@ -748,26 +751,20 @@ static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache, struct dentry *victim; int ret = -ENOENT; - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); + victim = start_removing(&nop_mnt_idmap, dir, &QSTR(filename)); - victim = lookup_one(&nop_mnt_idmap, &QSTR(filename), dir); if (IS_ERR(victim)) goto lookup_error; - if (d_is_negative(victim)) - goto lookup_put; if (d_inode(victim)->i_flags & S_KERNEL_FILE) goto lookup_busy; return victim; lookup_busy: ret = -EBUSY; -lookup_put: - inode_unlock(d_inode(dir)); - dput(victim); + end_removing(victim); return ERR_PTR(ret); lookup_error: - inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); if (ret == -ENOENT) return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */ @@ -815,18 +812,17 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, ret = cachefiles_bury_object(cache, NULL, dir, victim, FSCACHE_OBJECT_WAS_CULLED); + dput(victim); if (ret < 0) goto error; fscache_count_culled(); - dput(victim); _leave(" = 0"); return 0; error_unlock: - inode_unlock(d_inode(dir)); + end_removing(victim); error: - dput(victim); if (ret == -ENOENT) return -ESTALE; /* Probably got retired by the netfs */ diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c index 781aac4ef274..90ba926f488e 100644 --- a/fs/cachefiles/volume.c +++ b/fs/cachefiles/volume.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/slab.h> +#include <linux/namei.h> #include "internal.h" #include <trace/events/fscache.h> @@ -58,9 +59,11 @@ retry: if (ret < 0) { if (ret != -ESTALE) goto error_dir; - inode_lock_nested(d_inode(cache->store), I_MUTEX_PARENT); - cachefiles_bury_object(cache, NULL, cache->store, vdentry, - FSCACHE_VOLUME_IS_WEIRD); + vdentry = start_removing_dentry(cache->store, vdentry); + if (!IS_ERR(vdentry)) + cachefiles_bury_object(cache, NULL, cache->store, + vdentry, + FSCACHE_VOLUME_IS_WEIRD); cachefiles_put_directory(volume->dentry); cond_resched(); goto retry; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 322ed268f14a..63b75d214210 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1045,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping, ceph_wbc->index = ceph_wbc->start_index; ceph_wbc->end = -1; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; - } else { - ceph_wbc->tag = PAGECACHE_TAG_DIRTY; - } + ceph_wbc->tag = wbc_to_tag(wbc); ceph_wbc->op_idx = -1; ceph_wbc->num_ops = 0; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 930fbd54d2c8..f678bab189d8 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -26,7 +26,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) return; /* Only new inodes! */ - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return; WARN_ON_ONCE(ci->netfs.cache); diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index 7026e794813c..928746b92512 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -329,7 +329,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen) out: kfree(cryptbuf); if (dir != parent) { - if ((dir->i_state & I_NEW)) + if ((inode_state_read_once(dir) & I_NEW)) discard_new_inode(dir); else iput(dir); @@ -438,7 +438,7 @@ out: fscrypt_fname_free_buffer(&_tname); out_inode: if (dir != fname->dir) { - if ((dir->i_state & I_NEW)) + if ((inode_state_read_once(dir) & I_NEW)) discard_new_inode(dir); else iput(dir); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 99b30f784ee2..983390069f73 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -740,7 +740,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode, vino.ino, ceph_ino(dir), dentry->d_name.name); ceph_dir_clear_ordered(dir); ceph_init_inode_acls(inode, as_ctx); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { /* * If it's not I_NEW, then someone created this before * we got here. Assume the server is aware of it at @@ -901,7 +901,7 @@ retry: new_inode = NULL; goto out_req; } - WARN_ON_ONCE(!(new_inode->i_state & I_NEW)); + WARN_ON_ONCE(!(inode_state_read_once(new_inode) & I_NEW)); spin_lock(&dentry->d_lock); di->flags |= CEPH_DENTRY_ASYNC_CREATE; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a6e260d9e420..37d3a2477c17 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -132,7 +132,7 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, goto out_err; } - inode->i_state = 0; + inode_state_assign_raw(inode, 0); inode->i_mode = *mode; err = ceph_security_init_secctx(dentry, *mode, as_ctx); @@ -201,7 +201,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, doutc(cl, "on %llx=%llx.%llx got %p new %d\n", ceph_present_inode(inode), ceph_vinop(inode), inode, - !!(inode->i_state & I_NEW)); + !!(inode_state_read_once(inode) & I_NEW)); return inode; } @@ -228,7 +228,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) goto err; } - if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { + if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) { pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n", inode->i_mode); goto err; @@ -261,7 +261,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) } } #endif - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { inode->i_op = &ceph_snapdir_iops; inode->i_fop = &ceph_snapdir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ @@ -270,7 +270,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) return inode; err: - if ((inode->i_state & I_NEW)) + if ((inode_state_read_once(inode) & I_NEW)) discard_new_inode(inode); else iput(inode); @@ -744,7 +744,7 @@ void ceph_evict_inode(struct inode *inode) netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_NETFS_WB) + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); clear_inode(inode); @@ -1013,7 +1013,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, le64_to_cpu(info->version), ci->i_version); /* Once I_NEW is cleared, we can't change type or dev numbers */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { inode->i_mode = mode; } else { if (inode_wrong_type(inode, mode)) { @@ -1090,7 +1090,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, #ifdef CONFIG_FS_ENCRYPTION if (iinfo->fscrypt_auth_len && - ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) { + ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) { kfree(ci->fscrypt_auth); ci->fscrypt_auth_len = iinfo->fscrypt_auth_len; ci->fscrypt_auth = iinfo->fscrypt_auth; @@ -1692,13 +1692,13 @@ retry_lookup: pr_err_client(cl, "badness %p %llx.%llx\n", in, ceph_vinop(in)); req->r_target_inode = NULL; - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) discard_new_inode(in); else iput(in); goto done; } - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) unlock_new_inode(in); } @@ -1898,11 +1898,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, pr_err_client(cl, "inode badness on %p got %d\n", in, rc); err = rc; - if (in->i_state & I_NEW) { + if (inode_state_read_once(in) & I_NEW) { ihold(in); discard_new_inode(in); } - } else if (in->i_state & I_NEW) { + } else if (inode_state_read_once(in) & I_NEW) { unlock_new_inode(in); } @@ -2114,7 +2114,7 @@ retry_lookup: pr_err_client(cl, "badness on %p %llx.%llx\n", in, ceph_vinop(in)); if (d_really_is_negative(dn)) { - if (in->i_state & I_NEW) { + if (inode_state_read_once(in) & I_NEW) { ihold(in); discard_new_inode(in); } @@ -2124,7 +2124,7 @@ retry_lookup: err = ret; goto next_item; } - if (in->i_state & I_NEW) + if (inode_state_read_once(in) & I_NEW) unlock_new_inode(in); if (d_really_is_negative(dn)) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index ad0cf177e75a..f6bf24b5c683 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1149,7 +1149,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, const char *path = fsc->mount_options->server_path ? fsc->mount_options->server_path + 1 : ""; - err = __ceph_open_session(fsc->client, started); + err = __ceph_open_session(fsc->client); if (err < 0) goto out; diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 62a3d2565c26..70bb0579b40c 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -70,7 +70,7 @@ retry: if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { cii = ITOC(inode); /* we still need to set i_ino for things like stat(2) */ inode->i_ino = hash; @@ -148,7 +148,7 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb) /* we should never see newly created inodes because we intentionally * fail in the initialization callback */ - BUG_ON(inode->i_state & I_NEW); + BUG_ON(inode_state_read_once(inode) & I_NEW); return inode; } diff --git a/fs/coredump.c b/fs/coredump.c index 5c1c381ee380..fe4099e0530b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1036,7 +1036,7 @@ static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm, static bool coredump_write(struct core_name *cn, struct coredump_params *cprm, - struct linux_binfmt *binfmt) + const struct linux_binfmt *binfmt) { if (dump_interrupted()) @@ -1086,119 +1086,119 @@ static inline bool coredump_skip(const struct coredump_params *cprm, return false; } -void vfs_coredump(const kernel_siginfo_t *siginfo) +static void do_coredump(struct core_name *cn, struct coredump_params *cprm, + size_t **argv, int *argc, const struct linux_binfmt *binfmt) { - struct cred *cred __free(put_cred) = NULL; - size_t *argv __free(kfree) = NULL; - struct core_state core_state; - struct core_name cn; - struct mm_struct *mm = current->mm; - struct linux_binfmt *binfmt = mm->binfmt; - const struct cred *old_cred; - int argc = 0; - struct coredump_params cprm = { - .siginfo = siginfo, - .limit = rlimit(RLIMIT_CORE), - /* - * We must use the same mm->flags while dumping core to avoid - * inconsistency of bit flags, since this flag is not protected - * by any locks. - * - * Note that we only care about MMF_DUMP* flags. - */ - .mm_flags = __mm_flags_get_dumpable(mm), - .vma_meta = NULL, - .cpu = raw_smp_processor_id(), - }; - - audit_core_dumps(siginfo->si_signo); - - if (coredump_skip(&cprm, binfmt)) - return; - - cred = prepare_creds(); - if (!cred) - return; - /* - * We cannot trust fsuid as being the "true" uid of the process - * nor do we know its entire history. We only know it was tainted - * so we dump it as root in mode 2, and only into a controlled - * environment (pipe handler or fully qualified path). - */ - if (coredump_force_suid_safe(&cprm)) - cred->fsuid = GLOBAL_ROOT_UID; - - if (coredump_wait(siginfo->si_signo, &core_state) < 0) - return; - - old_cred = override_creds(cred); - - if (!coredump_parse(&cn, &cprm, &argv, &argc)) { + if (!coredump_parse(cn, cprm, argv, argc)) { coredump_report_failure("format_corename failed, aborting core"); - goto close_fail; + return; } - switch (cn.core_type) { + switch (cn->core_type) { case COREDUMP_FILE: - if (!coredump_file(&cn, &cprm, binfmt)) - goto close_fail; + if (!coredump_file(cn, cprm, binfmt)) + return; break; case COREDUMP_PIPE: - if (!coredump_pipe(&cn, &cprm, argv, argc)) - goto close_fail; + if (!coredump_pipe(cn, cprm, *argv, *argc)) + return; break; case COREDUMP_SOCK_REQ: fallthrough; case COREDUMP_SOCK: - if (!coredump_socket(&cn, &cprm)) - goto close_fail; + if (!coredump_socket(cn, cprm)) + return; break; default: WARN_ON_ONCE(true); - goto close_fail; + return; } /* Don't even generate the coredump. */ - if (cn.mask & COREDUMP_REJECT) - goto close_fail; + if (cn->mask & COREDUMP_REJECT) + return; /* get us an unshared descriptor table; almost always a no-op */ /* The cell spufs coredump code reads the file descriptor tables */ if (unshare_files()) - goto close_fail; + return; - if ((cn.mask & COREDUMP_KERNEL) && !coredump_write(&cn, &cprm, binfmt)) - goto close_fail; + if ((cn->mask & COREDUMP_KERNEL) && !coredump_write(cn, cprm, binfmt)) + return; - coredump_sock_shutdown(cprm.file); + coredump_sock_shutdown(cprm->file); /* Let the parent know that a coredump was generated. */ - if (cn.mask & COREDUMP_USERSPACE) - cn.core_dumped = true; + if (cn->mask & COREDUMP_USERSPACE) + cn->core_dumped = true; /* * When core_pipe_limit is set we wait for the coredump server * or usermodehelper to finish before exiting so it can e.g., * inspect /proc/<pid>. */ - if (cn.mask & COREDUMP_WAIT) { - switch (cn.core_type) { + if (cn->mask & COREDUMP_WAIT) { + switch (cn->core_type) { case COREDUMP_PIPE: - wait_for_dump_helpers(cprm.file); + wait_for_dump_helpers(cprm->file); break; case COREDUMP_SOCK_REQ: fallthrough; case COREDUMP_SOCK: - coredump_sock_wait(cprm.file); + coredump_sock_wait(cprm->file); break; default: break; } } +} + +void vfs_coredump(const kernel_siginfo_t *siginfo) +{ + size_t *argv __free(kfree) = NULL; + struct core_state core_state; + struct core_name cn; + const struct mm_struct *mm = current->mm; + const struct linux_binfmt *binfmt = mm->binfmt; + int argc = 0; + struct coredump_params cprm = { + .siginfo = siginfo, + .limit = rlimit(RLIMIT_CORE), + /* + * We must use the same mm->flags while dumping core to avoid + * inconsistency of bit flags, since this flag is not protected + * by any locks. + * + * Note that we only care about MMF_DUMP* flags. + */ + .mm_flags = __mm_flags_get_dumpable(mm), + .vma_meta = NULL, + .cpu = raw_smp_processor_id(), + }; + + audit_core_dumps(siginfo->si_signo); + + if (coredump_skip(&cprm, binfmt)) + return; + + CLASS(prepare_creds, cred)(); + if (!cred) + return; + /* + * We cannot trust fsuid as being the "true" uid of the process + * nor do we know its entire history. We only know it was tainted + * so we dump it as root in mode 2, and only into a controlled + * environment (pipe handler or fully qualified path). + */ + if (coredump_force_suid_safe(&cprm)) + cred->fsuid = GLOBAL_ROOT_UID; + + if (coredump_wait(siginfo->si_signo, &core_state) < 0) + return; -close_fail: + scoped_with_creds(cred) + do_coredump(&cn, &cprm, &argv, &argc, binfmt); coredump_cleanup(&cn, &cprm); - revert_creds(old_cred); return; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index ca54bf24b719..e54ebe402df7 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -95,7 +95,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, inode = iget_locked(sb, cramino(cramfs_inode, offset)); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; switch (cramfs_inode->mode & S_IFMT) { diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 5dee7c498bc8..ed6e926226b5 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -333,8 +333,7 @@ static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh, inode = mapping->host; *inode_ret = inode; - *lblk_num_ret = ((u64)folio->index << (PAGE_SHIFT - inode->i_blkbits)) + - (bh_offset(bh) >> inode->i_blkbits); + *lblk_num_ret = (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits; return true; } diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 3adbd7167055..5e939ea3ac28 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -945,7 +945,7 @@ static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { inode = ci->ci_inode; spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4bd3918f50e3..40fa05688d3a 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -834,7 +834,7 @@ int fscrypt_drop_inode(struct inode *inode) * userspace is still using the files, inodes can be dirtied between * then and now. We mustn't lose any writes, so skip dirty inodes here. */ - if (inode->i_state & I_DIRTY_ALL) + if (inode_state_read(inode) & I_DIRTY_ALL) return 0; /* @@ -1507,7 +1507,7 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) /* already zeroed? we're done. */ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); /* * invalidate the pages whose sharing state is to be changed @@ -1536,10 +1536,10 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (ret < 0) return ret; - ret = iomap_iter_advance(iter, &length); + ret = iomap_iter_advance(iter, length); if (ret) return ret; - } while (length > 0); + } while ((length = iomap_length(iter)) > 0); if (did_zero) *did_zero = true; @@ -1597,7 +1597,7 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { done = iov_iter_zero(min(length, end - pos), iter); - return iomap_iter_advance(iomi, &done); + return iomap_iter_advance(iomi, done); } } @@ -1681,12 +1681,12 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); - length = xfer; - ret = iomap_iter_advance(iomi, &length); + ret = iomap_iter_advance(iomi, xfer); if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; + length = iomap_length(iomi); } dax_read_unlock(id); @@ -1919,10 +1919,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp, ret |= VM_FAULT_MAJOR; } - if (!(ret & VM_FAULT_ERROR)) { - u64 length = PAGE_SIZE; - iter.status = iomap_iter_advance(&iter, &length); - } + if (!(ret & VM_FAULT_ERROR)) + iter.status = iomap_iter_advance(&iter, PAGE_SIZE); } if (iomap_errp) @@ -2034,10 +2032,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp, continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); - if (ret != VM_FAULT_FALLBACK) { - u64 length = PMD_SIZE; - iter.status = iomap_iter_advance(&iter, &length); - } + if (ret != VM_FAULT_FALLBACK) + iter.status = iomap_iter_advance(&iter, PMD_SIZE); } unlock_entry: @@ -2163,7 +2159,6 @@ static int dax_range_compare_iter(struct iomap_iter *it_src, const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; - u64 dest_len; void *saddr, *daddr; int id, ret; @@ -2196,10 +2191,9 @@ static int dax_range_compare_iter(struct iomap_iter *it_src, dax_read_unlock(id); advance: - dest_len = len; - ret = iomap_iter_advance(it_src, &len); + ret = iomap_iter_advance(it_src, len); if (!ret) - ret = iomap_iter_advance(it_dest, &dest_len); + ret = iomap_iter_advance(it_dest, len); return ret; out_unlock: diff --git a/fs/dcache.c b/fs/dcache.c index 035cccbc9276..9143fd502def 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -86,7 +86,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); -static struct kmem_cache *dentry_cache __ro_after_init; +static struct kmem_cache *__dentry_cache __ro_after_init; +#define dentry_cache runtime_const_ptr(__dentry_cache) const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); @@ -794,7 +795,7 @@ void d_mark_dontcache(struct inode *inode) de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } - inode->i_state |= I_DONTCACHE; + inode_state_set(inode, I_DONTCACHE); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); @@ -1073,7 +1074,7 @@ struct dentry *d_find_alias_rcu(struct inode *inode) spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no aliases left - if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) { + if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { @@ -1980,14 +1981,8 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); - WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW & ~I_CREATING; - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb(); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } @@ -2306,11 +2301,20 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; - if (d_unhashed(dentry)) - continue; if (dentry->d_name.hash_len != hashlen) continue; - if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) + if (unlikely(dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)) + continue; + /* + * Check for the dentry being unhashed. + * + * As tempting as it is, we *can't* skip it because of a race window + * between us finding the dentry before it gets unhashed and loading + * the sequence counter after unhashing is finished. + * + * We can at least predict on it. + */ + if (unlikely(d_unhashed(dentry))) continue; *seqp = seq; return dentry; @@ -3222,9 +3226,10 @@ static void __init dcache_init(void) * but it is probably not worth it because of the cache nature * of the dcache. */ - dentry_cache = KMEM_CACHE_USERCOPY(dentry, + __dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_shortname.string); + runtime_const_init(ptr, __dentry_cache); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 661a99a7dfbe..532bd7c46baf 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -403,7 +403,7 @@ static struct dentry *debugfs_start_creating(const char *name, return dentry; } -static struct dentry *failed_creating(struct dentry *dentry) +static struct dentry *debugfs_failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); @@ -411,7 +411,7 @@ static struct dentry *failed_creating(struct dentry *dentry) return ERR_PTR(-ENOMEM); } -static struct dentry *end_creating(struct dentry *dentry) +static struct dentry *debugfs_end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; @@ -435,7 +435,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { - failed_creating(dentry); + debugfs_failed_creating(dentry); return ERR_PTR(-EPERM); } @@ -443,7 +443,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, if (unlikely(!inode)) { pr_err("out of free dentries, can not create file '%s'\n", name); - return failed_creating(dentry); + return debugfs_failed_creating(dentry); } inode->i_mode = mode; @@ -458,7 +458,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return debugfs_end_creating(dentry); } struct dentry *debugfs_create_file_full(const char *name, umode_t mode, @@ -585,7 +585,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { - failed_creating(dentry); + debugfs_failed_creating(dentry); return ERR_PTR(-EPERM); } @@ -593,7 +593,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) if (unlikely(!inode)) { pr_err("out of free dentries, can not create directory '%s'\n", name); - return failed_creating(dentry); + return debugfs_failed_creating(dentry); } inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; @@ -605,7 +605,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return debugfs_end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); @@ -632,7 +632,7 @@ struct dentry *debugfs_create_automount(const char *name, return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { - failed_creating(dentry); + debugfs_failed_creating(dentry); return ERR_PTR(-EPERM); } @@ -640,7 +640,7 @@ struct dentry *debugfs_create_automount(const char *name, if (unlikely(!inode)) { pr_err("out of free dentries, can not create automount '%s'\n", name); - return failed_creating(dentry); + return debugfs_failed_creating(dentry); } make_empty_dir_inode(inode); @@ -652,7 +652,7 @@ struct dentry *debugfs_create_automount(const char *name, d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return debugfs_end_creating(dentry); } EXPORT_SYMBOL(debugfs_create_automount); @@ -699,13 +699,13 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, pr_err("out of free dentries, can not create symlink '%s'\n", name); kfree(link); - return failed_creating(dentry); + return debugfs_failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_instantiate(dentry, inode); - return end_creating(dentry); + return debugfs_end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); @@ -842,7 +842,8 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, . int error = 0; const char *new_name; struct name_snapshot old_name; - struct dentry *parent, *target; + struct dentry *target; + struct renamedata rd = {}; struct inode *dir; va_list ap; @@ -855,36 +856,31 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, . if (!new_name) return -ENOMEM; - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock(dir); + rd.old_parent = dget_parent(dentry); + rd.new_parent = rd.old_parent; + rd.flags = RENAME_NOREPLACE; + target = lookup_noperm_unlocked(&QSTR(new_name), rd.new_parent); + if (IS_ERR(target)) + return PTR_ERR(target); - take_dentry_name_snapshot(&old_name, dentry); - - if (WARN_ON_ONCE(dentry->d_parent != parent)) { - error = -EINVAL; - goto out; - } - if (strcmp(old_name.name.name, new_name) == 0) - goto out; - target = lookup_noperm(&QSTR(new_name), parent); - if (IS_ERR(target)) { - error = PTR_ERR(target); - goto out; - } - if (d_really_is_positive(target)) { - dput(target); - error = -EINVAL; + error = start_renaming_two_dentries(&rd, dentry, target); + if (error) { + if (error == -EEXIST && target == dentry) + /* it isn't an error to rename a thing to itself */ + error = 0; goto out; } - simple_rename_timestamp(dir, dentry, dir, target); - d_move(dentry, target); - dput(target); + + dir = d_inode(rd.old_parent); + take_dentry_name_snapshot(&old_name, dentry); + simple_rename_timestamp(dir, dentry, dir, rd.new_dentry); + d_move(dentry, rd.new_dentry); fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry); -out: release_dentry_name_snapshot(&old_name); - inode_unlock(dir); - dput(parent); + end_renaming(&rd); +out: + dput(rd.old_parent); + dput(target); kfree_const(new_name); return error; } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 019a8b4eaaf9..49f56a598ecb 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -28,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) * inodes without pages but we deliberately won't in case * we need to reschedule to avoid softlockups. */ - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || (mapping_empty(inode->i_mapping) && !need_resched())) { spin_unlock(&inode->i_lock); continue; diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 1bdeaa6d5790..c2f4fb41b4e6 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -4,7 +4,7 @@ config ECRYPT_FS depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO_ECB select CRYPTO_CBC - select CRYPTO_MD5 + select CRYPTO_LIB_MD5 help Encrypted filesystem that operates on the VFS layer. See <file:Documentation/filesystems/ecryptfs.rst> to learn more about diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 69536cacdea8..260f8a4938b0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -9,7 +9,6 @@ * Michael C. Thompson <mcthomps@us.ibm.com> */ -#include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/fs.h> #include <linux/mount.h> @@ -48,32 +47,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size) } } -/** - * ecryptfs_calculate_md5 - calculates the md5 of @src - * @dst: Pointer to 16 bytes of allocated memory - * @crypt_stat: Pointer to crypt_stat struct for the current inode - * @src: Data to be md5'd - * @len: Length of @src - * - * Uses the allocated crypto context that crypt_stat references to - * generate the MD5 sum of the contents of src. - */ -static int ecryptfs_calculate_md5(char *dst, - struct ecryptfs_crypt_stat *crypt_stat, - char *src, int len) -{ - int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst); - - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; rc = [%d]\n", - __func__, rc); - goto out; - } -out: - return rc; -} - static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name, char *cipher_name, char *chaining_modifier) @@ -104,13 +77,10 @@ out: * * Generate the initialization vector from the given root IV and page * offset. - * - * Returns zero on success; non-zero on error. */ -int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset) +void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset) { - int rc = 0; char dst[MD5_DIGEST_SIZE]; char src[ECRYPTFS_MAX_IV_BYTES + 16]; @@ -129,20 +99,12 @@ int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, ecryptfs_printk(KERN_DEBUG, "source:\n"); ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16)); } - rc = ecryptfs_calculate_md5(dst, crypt_stat, src, - (crypt_stat->iv_bytes + 16)); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error attempting to compute " - "MD5 while generating IV for a page\n"); - goto out; - } + md5(src, crypt_stat->iv_bytes + 16, dst); memcpy(iv, dst, crypt_stat->iv_bytes); if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "derived iv:\n"); ecryptfs_dump_hex(iv, crypt_stat->iv_bytes); } -out: - return rc; } /** @@ -151,29 +113,14 @@ out: * * Initialize the crypt_stat structure. */ -int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) { - struct crypto_shash *tfm; - int rc; - - tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0); - if (IS_ERR(tfm)) { - rc = PTR_ERR(tfm); - ecryptfs_printk(KERN_ERR, "Error attempting to " - "allocate crypto context; rc = [%d]\n", - rc); - return rc; - } - memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); INIT_LIST_HEAD(&crypt_stat->keysig_list); mutex_init(&crypt_stat->keysig_list_mutex); mutex_init(&crypt_stat->cs_mutex); mutex_init(&crypt_stat->cs_tfm_mutex); - crypt_stat->hash_tfm = tfm; crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED; - - return 0; } /** @@ -187,7 +134,6 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) struct ecryptfs_key_sig *key_sig, *key_sig_tmp; crypto_free_skcipher(crypt_stat->tfm); - crypto_free_shash(crypt_stat->hash_tfm); list_for_each_entry_safe(key_sig, key_sig_tmp, &crypt_stat->keysig_list, crypt_stat_list) { list_del(&key_sig->crypt_stat_list); @@ -361,14 +307,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat, int rc; extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size)); - rc = ecryptfs_derive_iv(extent_iv, crypt_stat, - (extent_base + extent_offset)); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " - "extent [0x%.16llx]; rc = [%d]\n", - (unsigned long long)(extent_base + extent_offset), rc); - goto out; - } + ecryptfs_derive_iv(extent_iv, crypt_stat, extent_base + extent_offset); sg_init_table(&src_sg, 1); sg_init_table(&dst_sg, 1); @@ -609,31 +548,20 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) */ int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat) { - int rc = 0; char dst[MD5_DIGEST_SIZE]; BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE); BUG_ON(crypt_stat->iv_bytes <= 0); if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { - rc = -EINVAL; ecryptfs_printk(KERN_WARNING, "Session key not valid; " "cannot generate root IV\n"); - goto out; - } - rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key, - crypt_stat->key_size); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error attempting to compute " - "MD5 while generating root IV\n"); - goto out; - } - memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); -out: - if (rc) { memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes); crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING; + return -EINVAL; } - return rc; + md5(crypt_stat->key, crypt_stat->key_size, dst); + memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); + return 0; } static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat) diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 9e6ab0b41337..62a2ea7f59ed 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -14,6 +14,7 @@ #ifndef ECRYPTFS_KERNEL_H #define ECRYPTFS_KERNEL_H +#include <crypto/md5.h> #include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> @@ -137,8 +138,6 @@ ecryptfs_get_key_payload_data(struct key *key) + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES) #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 -#define ECRYPTFS_DEFAULT_HASH "md5" -#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED @@ -163,8 +162,6 @@ ecryptfs_get_key_payload_data(struct key *key) * ECRYPTFS_MAX_IV_BYTES */ #define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ -#define MD5_DIGEST_SIZE 16 -#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE #define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \ + ECRYPTFS_SIG_SIZE + 1 + 1) #define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \ @@ -237,8 +234,6 @@ struct ecryptfs_crypt_stat { unsigned int extent_mask; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct crypto_skcipher *tfm; - struct crypto_shash *hash_tfm; /* Crypto context for generating - * the initialization vectors */ unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; @@ -558,7 +553,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_rotate_iv(unsigned char *iv); -int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat); @@ -693,8 +688,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, char *data, size_t max_packet_size); int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, struct ecryptfs_mount_crypt_stat *mount_crypt_stat); -int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset); +void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset); extern const struct xattr_handler * const ecryptfs_xattr_handlers[]; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ed1394da8d6b..3978248247dc 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -24,18 +24,26 @@ #include <linux/unaligned.h> #include "ecryptfs_kernel.h" -static int lock_parent(struct dentry *dentry, - struct dentry **lower_dentry, - struct inode **lower_dir) +static struct dentry *ecryptfs_start_creating_dentry(struct dentry *dentry) { - struct dentry *lower_dir_dentry; + struct dentry *parent = dget_parent(dentry); + struct dentry *ret; - lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); - *lower_dir = d_inode(lower_dir_dentry); - *lower_dentry = ecryptfs_dentry_to_lower(dentry); + ret = start_creating_dentry(ecryptfs_dentry_to_lower(parent), + ecryptfs_dentry_to_lower(dentry)); + dput(parent); + return ret; +} - inode_lock_nested(*lower_dir, I_MUTEX_PARENT); - return (*lower_dentry)->d_parent == lower_dir_dentry ? 0 : -EINVAL; +static struct dentry *ecryptfs_start_removing_dentry(struct dentry *dentry) +{ + struct dentry *parent = dget_parent(dentry); + struct dentry *ret; + + ret = start_removing_dentry(ecryptfs_dentry_to_lower(parent), + ecryptfs_dentry_to_lower(dentry)); + dput(parent); + return ret; } static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) @@ -95,7 +103,7 @@ static struct inode *__ecryptfs_get_inode(struct inode *lower_inode, iput(lower_inode); return ERR_PTR(-EACCES); } - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) iput(lower_inode); return inode; @@ -106,7 +114,7 @@ struct inode *ecryptfs_get_inode(struct inode *lower_inode, { struct inode *inode = __ecryptfs_get_inode(lower_inode, sb); - if (!IS_ERR(inode) && (inode->i_state & I_NEW)) + if (!IS_ERR(inode) && (inode_state_read_once(inode) & I_NEW)) unlock_new_inode(inode); return inode; @@ -141,15 +149,12 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, struct inode *lower_dir; int rc; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - dget(lower_dentry); // don't even try to make the lower negative - if (!rc) { - if (d_unhashed(lower_dentry)) - rc = -EINVAL; - else - rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, - NULL); - } + lower_dentry = ecryptfs_start_removing_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + + lower_dir = lower_dentry->d_parent->d_inode; + rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; @@ -158,8 +163,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); out_unlock: - dput(lower_dentry); - inode_unlock(lower_dir); + end_removing(lower_dentry); if (!rc) d_drop(dentry); return rc; @@ -186,10 +190,11 @@ ecryptfs_do_create(struct inode *directory_inode, struct inode *lower_dir; struct inode *inode; - rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir); - if (!rc) - rc = vfs_create(&nop_mnt_idmap, lower_dir, - lower_dentry, mode, true); + lower_dentry = ecryptfs_start_creating_dentry(ecryptfs_dentry); + if (IS_ERR(lower_dentry)) + return ERR_CAST(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + rc = vfs_create(&nop_mnt_idmap, lower_dentry, mode, NULL); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); @@ -205,7 +210,7 @@ ecryptfs_do_create(struct inode *directory_inode, fsstack_copy_attr_times(directory_inode, lower_dir); fsstack_copy_inode_size(directory_inode, lower_dir); out_lock: - inode_unlock(lower_dir); + end_creating(lower_dentry); return inode; } @@ -364,7 +369,7 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, } } - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); return d_splice_alias(inode, dentry); } @@ -433,10 +438,12 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, file_size_save = i_size_read(d_inode(old_dentry)); lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); - rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir); - if (!rc) - rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir, - lower_new_dentry, NULL); + lower_new_dentry = ecryptfs_start_creating_dentry(new_dentry); + if (IS_ERR(lower_new_dentry)) + return PTR_ERR(lower_new_dentry); + lower_dir = lower_new_dentry->d_parent->d_inode; + rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir, + lower_new_dentry, NULL); if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); @@ -448,7 +455,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink); i_size_write(d_inode(new_dentry), file_size_save); out_lock: - inode_unlock(lower_dir); + end_creating(lower_new_dentry); return rc; } @@ -468,9 +475,11 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap, size_t encoded_symlen; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (rc) - goto out_lock; + lower_dentry = ecryptfs_start_creating_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + mount_crypt_stat = &ecryptfs_superblock_to_private( dir->i_sb)->mount_crypt_stat; rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, @@ -480,7 +489,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap, if (rc) goto out_lock; rc = vfs_symlink(&nop_mnt_idmap, lower_dir, lower_dentry, - encoded_symname); + encoded_symname, NULL); kfree(encoded_symname); if (rc || d_really_is_negative(lower_dentry)) goto out_lock; @@ -490,7 +499,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap, fsstack_copy_attr_times(dir, lower_dir); fsstack_copy_inode_size(dir, lower_dir); out_lock: - inode_unlock(lower_dir); + end_creating(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -501,14 +510,16 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, { int rc; struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; struct inode *lower_dir; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (rc) - goto out; - + lower_dentry = ecryptfs_start_creating_dentry(dentry); + if (IS_ERR(lower_dentry)) + return lower_dentry; + lower_dir_dentry = dget(lower_dentry->d_parent); + lower_dir = lower_dir_dentry->d_inode; lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir, - lower_dentry, mode); + lower_dentry, mode, NULL); rc = PTR_ERR(lower_dentry); if (IS_ERR(lower_dentry)) goto out; @@ -522,7 +533,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, fsstack_copy_inode_size(dir, lower_dir); set_nlink(dir, lower_dir->i_nlink); out: - inode_unlock(lower_dir); + end_creating(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return ERR_PTR(rc); @@ -534,21 +545,18 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) struct inode *lower_dir; int rc; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - dget(lower_dentry); // don't even try to make the lower negative - if (!rc) { - if (d_unhashed(lower_dentry)) - rc = -EINVAL; - else - rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry); - } + lower_dentry = ecryptfs_start_removing_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + + rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry, NULL); if (!rc) { clear_nlink(d_inode(dentry)); fsstack_copy_attr_times(dir, lower_dir); set_nlink(dir, lower_dir->i_nlink); } - dput(lower_dentry); - inode_unlock(lower_dir); + end_removing(lower_dentry); if (!rc) d_drop(dentry); return rc; @@ -562,10 +570,12 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *lower_dentry; struct inode *lower_dir; - rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (!rc) - rc = vfs_mknod(&nop_mnt_idmap, lower_dir, - lower_dentry, mode, dev); + lower_dentry = ecryptfs_start_creating_dentry(dentry); + if (IS_ERR(lower_dentry)) + return PTR_ERR(lower_dentry); + lower_dir = lower_dentry->d_parent->d_inode; + + rc = vfs_mknod(&nop_mnt_idmap, lower_dir, lower_dentry, mode, dev, NULL); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); @@ -574,7 +584,7 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, fsstack_copy_attr_times(dir, lower_dir); fsstack_copy_inode_size(dir, lower_dir); out: - inode_unlock(lower_dir); + end_removing(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -590,7 +600,6 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *lower_new_dentry; struct dentry *lower_old_dir_dentry; struct dentry *lower_new_dir_dentry; - struct dentry *trap; struct inode *target_inode; struct renamedata rd = {}; @@ -605,31 +614,13 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, target_inode = d_inode(new_dentry); - trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); - if (IS_ERR(trap)) - return PTR_ERR(trap); - dget(lower_new_dentry); - rc = -EINVAL; - if (lower_old_dentry->d_parent != lower_old_dir_dentry) - goto out_lock; - if (lower_new_dentry->d_parent != lower_new_dir_dentry) - goto out_lock; - if (d_unhashed(lower_old_dentry) || d_unhashed(lower_new_dentry)) - goto out_lock; - /* source should not be ancestor of target */ - if (trap == lower_old_dentry) - goto out_lock; - /* target should not be ancestor of source */ - if (trap == lower_new_dentry) { - rc = -ENOTEMPTY; - goto out_lock; - } + rd.mnt_idmap = &nop_mnt_idmap; + rd.old_parent = lower_old_dir_dentry; + rd.new_parent = lower_new_dir_dentry; + rc = start_renaming_two_dentries(&rd, lower_old_dentry, lower_new_dentry); + if (rc) + return rc; - rd.mnt_idmap = &nop_mnt_idmap; - rd.old_parent = lower_old_dir_dentry; - rd.old_dentry = lower_old_dentry; - rd.new_parent = lower_new_dir_dentry; - rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); if (rc) goto out_lock; @@ -640,8 +631,7 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new_dir != old_dir) fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); out_lock: - dput(lower_new_dentry); - unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + end_renaming(&rd); return rc; } @@ -903,11 +893,8 @@ static int ecryptfs_setattr(struct mnt_idmap *idmap, struct ecryptfs_crypt_stat *crypt_stat; crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; - if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) { - rc = ecryptfs_init_crypt_stat(crypt_stat); - if (rc) - return rc; - } + if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) + ecryptfs_init_crypt_stat(crypt_stat); inode = d_inode(dentry); lower_inode = ecryptfs_inode_to_lower(inode); lower_dentry = ecryptfs_dentry_to_lower(dentry); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 7f9f68c00ef6..bbf8603242fa 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -11,7 +11,6 @@ * Trevor S. Highland <trevor.highland@gmail.com> */ -#include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/string.h> #include <linux/pagemap.h> @@ -601,10 +600,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack { struct crypto_skcipher *skcipher_tfm; struct skcipher_request *skcipher_req; char iv[ECRYPTFS_MAX_IV_BYTES]; - char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; - char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; - struct crypto_shash *hash_tfm; - struct shash_desc *hash_desc; + char hash[MD5_DIGEST_SIZE]; }; /* @@ -741,51 +737,15 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "password tokens\n", __func__); goto out_free_unlock; } - s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0); - if (IS_ERR(s->hash_tfm)) { - rc = PTR_ERR(s->hash_tfm); - printk(KERN_ERR "%s: Error attempting to " - "allocate hash crypto context; rc = [%d]\n", - __func__, rc); - goto out_free_unlock; - } - - s->hash_desc = kmalloc(sizeof(*s->hash_desc) + - crypto_shash_descsize(s->hash_tfm), GFP_KERNEL); - if (!s->hash_desc) { - rc = -ENOMEM; - goto out_release_free_unlock; - } - s->hash_desc->tfm = s->hash_tfm; - - rc = crypto_shash_digest(s->hash_desc, - (u8 *)s->auth_tok->token.password.session_key_encryption_key, - s->auth_tok->token.password.session_key_encryption_key_bytes, - s->hash); - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; rc = [%d]\n", - __func__, rc); - goto out_release_free_unlock; - } + md5(s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes, + s->hash); for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { s->block_aligned_filename[s->j] = - s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; - if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) - == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { - rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash, - ECRYPTFS_TAG_70_DIGEST_SIZE, - s->tmp_hash); - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; " - "rc = [%d]\n", __func__, rc); - goto out_release_free_unlock; - } - memcpy(s->hash, s->tmp_hash, - ECRYPTFS_TAG_70_DIGEST_SIZE); - } + s->hash[s->j % MD5_DIGEST_SIZE]; + if ((s->j % MD5_DIGEST_SIZE) == (MD5_DIGEST_SIZE - 1)) + md5(s->hash, MD5_DIGEST_SIZE, s->hash); if (s->block_aligned_filename[s->j] == '\0') s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; } @@ -798,7 +758,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "convert filename memory to scatterlist; rc = [%d]. " "block_aligned_filename_size = [%zd]\n", __func__, rc, s->block_aligned_filename_size); - goto out_release_free_unlock; + goto out_free_unlock; } rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, s->dst_sg, 2); @@ -807,7 +767,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "convert encrypted filename memory to scatterlist; " "rc = [%d]. block_aligned_filename_size = [%zd]\n", __func__, rc, s->block_aligned_filename_size); - goto out_release_free_unlock; + goto out_free_unlock; } /* The characters in the first block effectively do the job * of the IV here, so we just use 0's for the IV. Note the @@ -825,7 +785,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, rc, s->auth_tok->token.password.session_key_encryption_key, mount_crypt_stat->global_default_fn_cipher_key_bytes); - goto out_release_free_unlock; + goto out_free_unlock; } skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg, s->block_aligned_filename_size, s->iv); @@ -833,13 +793,11 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt filename; " "rc = [%d]\n", __func__, rc); - goto out_release_free_unlock; + goto out_free_unlock; } s->i += s->block_aligned_filename_size; (*packet_size) = s->i; (*remaining_bytes) -= (*packet_size); -out_release_free_unlock: - crypto_free_shash(s->hash_tfm); out_free_unlock: kfree_sensitive(s->block_aligned_filename); out_unlock: @@ -850,7 +808,6 @@ out: key_put(auth_tok_key); } skcipher_request_free(s->skcipher_req); - kfree_sensitive(s->hash_desc); kfree(s); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 16ea14dd2c62..c12dc680f8fe 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -12,6 +12,7 @@ #include <linux/dcache.h> #include <linux/file.h> +#include <linux/fips.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/skbuff.h> @@ -454,6 +455,12 @@ static int ecryptfs_get_tree(struct fs_context *fc) goto out; } + if (fips_enabled) { + rc = -EINVAL; + err = "eCryptfs support is disabled due to FIPS"; + goto out; + } + s = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(s)) { rc = PTR_ERR(s); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index e7b7f426fecf..3bc21d677564 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -41,10 +41,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb) inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL); if (unlikely(!inode_info)) goto out; - if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) { - kmem_cache_free(ecryptfs_inode_info_cache, inode_info); - goto out; - } + ecryptfs_init_crypt_stat(&inode_info->crypt_stat); mutex_init(&inode_info->lower_file_mutex); atomic_set(&inode_info->lower_file_count, 0); inode_info->lower_file = NULL; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 1f4d8ce56667..6de97565d5f7 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -533,6 +533,7 @@ static struct file_system_type efivarfs_type = { .init_fs_context = efivarfs_init_fs_context, .kill_sb = efivarfs_kill_sb, .parameters = efivarfs_parameters, + .fs_flags = FS_POWER_FREEZE, }; static __init int efivarfs_init(void) diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 462619e59766..28407578f83a 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -62,7 +62,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) inode = iget_locked(super, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; in = INODE_INFO(inode); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8ca29962a3dd..bb13c4cb8455 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -371,7 +371,8 @@ static int erofs_read_folio(struct file *file, struct folio *folio) { trace_erofs_read_folio(folio, true); - return iomap_read_folio(folio, &erofs_iomap_ops); + iomap_bio_read_folio(folio, &erofs_iomap_ops); + return 0; } static void erofs_readahead(struct readahead_control *rac) @@ -379,7 +380,7 @@ static void erofs_readahead(struct readahead_control *rac) trace_erofs_readahead(rac->mapping->host, readahead_index(rac), readahead_count(rac), true); - return iomap_readahead(rac, &erofs_iomap_ops); + iomap_bio_readahead(rac, &erofs_iomap_ops); } static sector_t erofs_bmap(struct address_space *mapping, sector_t block) diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c index b4bfe14229f9..e38d93bb2104 100644 --- a/fs/erofs/decompressor_zstd.c +++ b/fs/erofs/decompressor_zstd.c @@ -172,7 +172,6 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, dctx.bounce = strm->bounce; do { - dctx.avail_out = out_buf.size - out_buf.pos; dctx.inbuf_sz = in_buf.size; dctx.inbuf_pos = in_buf.pos; err = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst, @@ -188,14 +187,18 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, in_buf.pos = dctx.inbuf_pos; zerr = zstd_decompress_stream(stream, &out_buf, &in_buf); - if (zstd_is_error(zerr) || (!zerr && rq->outputsize)) { + dctx.avail_out = out_buf.size - out_buf.pos; + if (zstd_is_error(zerr) || + ((rq->outputsize + dctx.avail_out) && (!zerr || (zerr > 0 && + !(rq->inputsize + in_buf.size - in_buf.pos))))) { erofs_err(sb, "failed to decompress in[%u] out[%u]: %s", rq->inputsize, rq->outputsize, - zerr ? zstd_get_error_name(zerr) : "unexpected end of stream"); + zstd_is_error(zerr) ? zstd_get_error_name(zerr) : + "unexpected end of stream"); err = -EFSCORRUPTED; break; } - } while (rq->outputsize || out_buf.pos < out_buf.size); + } while (rq->outputsize + dctx.avail_out); if (dctx.kout) kunmap_local(dctx.kout); diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index b7b3432a9882..d27938435b2f 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -47,7 +47,6 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) { - const struct cred *old_cred; struct iov_iter iter; int ret; @@ -61,9 +60,8 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); - old_cred = override_creds(rq->iocb.ki_filp->f_cred); - ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); - revert_creds(old_cred); + scoped_with_creds(rq->iocb.ki_filp->f_cred) + ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); if (ret != -EIOCBQUEUED) erofs_fileio_ki_complete(&rq->iocb, ret); } diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index cb780c095d28..bce98c845a18 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -295,7 +295,7 @@ struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { int err = erofs_fill_inode(inode); if (err) { diff --git a/fs/eventfd.c b/fs/eventfd.c index af42b2c7d235..3219e0d596fe 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -378,9 +378,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); static int do_eventfd(unsigned int count, int flags) { - struct eventfd_ctx *ctx; - struct file *file; - int fd; + struct eventfd_ctx *ctx __free(kfree) = NULL; /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); @@ -398,26 +396,19 @@ static int do_eventfd(unsigned int count, int flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; - ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL); flags &= EFD_SHARED_FCNTL_FLAGS; flags |= O_RDWR; - fd = get_unused_fd_flags(flags); - if (fd < 0) - goto err; - - file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, - ctx, flags, FMODE_NOWAIT); - if (IS_ERR(file)) { - put_unused_fd(fd); - fd = PTR_ERR(file); - goto err; - } - fd_install(fd, file); - return fd; -err: - eventfd_free_ctx(ctx); - return fd; + + FD_PREPARE(fdf, flags, + anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, ctx, + flags, FMODE_NOWAIT)); + if (fdf.err) + return fdf.err; + + ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL); + retain_and_null_ptr(ctx); + return fd_publish(fdf); } SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ee7c4b683ec3..6c36d9dc6926 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2165,9 +2165,8 @@ static void clear_tfile_check_list(void) */ static int do_epoll_create(int flags) { - int error, fd; - struct eventpoll *ep = NULL; - struct file *file; + int error; + struct eventpoll *ep; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); @@ -2184,26 +2183,15 @@ static int do_epoll_create(int flags) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); - if (fd < 0) { - error = fd; - goto out_free_ep; - } - file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, - O_RDWR | (flags & O_CLOEXEC)); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto out_free_fd; + FD_PREPARE(fdf, O_RDWR | (flags & O_CLOEXEC), + anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, + O_RDWR | (flags & O_CLOEXEC))); + if (fdf.err) { + ep_clear_and_put(ep); + return fdf.err; } - ep->file = file; - fd_install(fd, file); - return fd; - -out_free_fd: - put_unused_fd(fd); -out_free_ep: - ep_clear_and_put(ep); - return error; + ep->file = fd_prepare_file(fdf); + return fd_publish(fdf); } SYSCALL_DEFINE1(epoll_create1, int, flags) diff --git a/fs/exec.c b/fs/exec.c index 4298e7e08d5d..9d5ebc9d15b0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1280,10 +1280,9 @@ int begin_new_exec(struct linux_binprm * bprm) /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { - retval = get_unused_fd_flags(0); + retval = FD_ADD(0, bprm->executable); if (retval < 0) goto out_unlock; - fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } @@ -1775,7 +1774,7 @@ out: force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); - rseq_set_notify_resume(current); + rseq_force_update(); current->in_execve = 0; return retval; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 7f9592856bf7..74d451f732c7 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -433,7 +433,10 @@ static int exfat_read_boot_sector(struct super_block *sb) struct exfat_sb_info *sbi = EXFAT_SB(sb); /* set block size to read super block */ - sb_min_blocksize(sb, 512); + if (!sb_min_blocksize(sb, 512)) { + exfat_err(sb, "unable to set blocksize"); + return -EINVAL; + } /* read boot sector */ sbi->boot_bh = sb_bread(sb, 0); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e10c376843d7..dbfe9098a124 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1398,7 +1398,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ei = EXT2_I(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e99306a8f47c..78ea864fa8cd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -202,8 +202,7 @@ void ext4_evict_inode(struct inode *inode) * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. */ - if (!list_empty_careful(&inode->i_io_list)) - inode_io_list_del(inode); + inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any @@ -425,7 +424,7 @@ void ext4_check_map_extents_env(struct inode *inode) if (!S_ISREG(inode->i_mode) || IS_NOQUOTA(inode) || IS_VERITY(inode) || is_special_ino(inode->i_sb, inode->i_ino) || - (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || ext4_verity_in_progress(inode)) return; @@ -1319,8 +1318,8 @@ retry_grab: if (IS_ERR(folio)) return PTR_ERR(folio); - if (pos + len > folio_pos(folio) + folio_size(folio)) - len = folio_pos(folio) + folio_size(folio) - pos; + if (len > folio_next_pos(folio) - pos) + len = folio_next_pos(folio) - pos; from = offset_in_folio(folio, pos); to = from + len; @@ -2619,10 +2618,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_folio(mpd->inode); - if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(mpd->wbc); mpd->map.m_len = 0; mpd->next_pos = mpd->start_pos; @@ -2704,7 +2700,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->start_pos = folio_pos(folio); - mpd->next_pos = folio_pos(folio) + folio_size(folio); + mpd->next_pos = folio_next_pos(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we @@ -3146,8 +3142,8 @@ retry: if (IS_ERR(folio)) return PTR_ERR(folio); - if (pos + len > folio_pos(folio) + folio_size(folio)) - len = folio_pos(folio) + folio_size(folio) - pos; + if (len > folio_next_pos(folio) - pos) + len = folio_next_pos(folio) - pos; ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); @@ -3473,7 +3469,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->i_private_list)) return true; - return inode->i_state & I_DIRTY_DATASYNC; + return inode_state_read_once(inode) & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, @@ -4552,7 +4548,7 @@ int ext4_truncate(struct inode *inode) * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ - if (!(inode->i_state & (I_NEW|I_FREEING))) + if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); @@ -5210,7 +5206,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); @@ -5549,7 +5545,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb, if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index ab1ff51302fb..6f57c181ff77 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -57,16 +57,12 @@ static int write_mmp_block_thawed(struct super_block *sb, static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) { - int err; - /* * We protect against freezing so that we don't create dirty buffers * on frozen filesystem. */ - sb_start_write(sb); - err = write_mmp_block_thawed(sb, bh); - sb_end_write(sb); - return err; + scoped_guard(super_write, sb) + return write_mmp_block_thawed(sb, bh); } /* diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 82d5e7501455..5fd54adf0c88 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -107,7 +107,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!sbi->s_journal || is_bad_inode(inode)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_inode_orphan_tracked(inode)) return 0; @@ -232,7 +232,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) return ext4_orphan_file_del(handle, inode); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index d4d7f329d23f..fa8d81a30fb9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -9,6 +9,7 @@ * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ +#include <linux/fs_struct.h> #include <linux/f2fs_fs.h> #include "f2fs.h" #include "xattr.h" diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6ad8d3bc6df7..be53e06caf3d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1329,7 +1329,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, } folio = page_folio(cc->rpages[last_index]); - psize = folio_pos(folio) + folio_size(folio); + psize = folio_next_pos(folio); err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 775aa4f63aa3..8bf4feda42b0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2986,10 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) @@ -4222,7 +4219,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (map.m_flags & F2FS_MAP_NEW) iomap->flags |= IOMAP_F_NEW; - if ((inode->i_state & I_DIRTY_DATASYNC) || + if ((inode_state_read_once(inode) & I_DIRTY_DATASYNC) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 8c4eafe9ffac..f1cda1900658 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -569,7 +569,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { if (is_meta_ino(sbi, ino)) { f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino); set_sbi_flag(sbi, SBI_NEED_FSCK); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b882771e4699..af40282a6948 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -844,7 +844,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, f2fs_i_links_write(inode, false); spin_lock(&inode->i_lock); - inode->i_state |= I_LINKABLE; + inode_state_set(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } else { if (file) @@ -1057,7 +1057,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto put_out_dir; spin_lock(&whiteout->i_lock); - whiteout->i_state &= ~I_LINKABLE; + inode_state_clear(whiteout, I_LINKABLE); spin_unlock(&whiteout->i_lock); iput(whiteout); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index db7afb806411..47489d48f2b9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1798,7 +1798,7 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { + if ((!inode_unhashed(inode) && inode_state_read(inode) & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ __iget(inode); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 9648ed097816..0b6009cd1844 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -22,6 +22,7 @@ #include <linux/unaligned.h> #include <linux/random.h> #include <linux/iversion.h> +#include <linux/fs_struct.h> #include "fat.h" #ifndef CONFIG_FAT_DEFAULT_IOCHARSET @@ -1595,8 +1596,12 @@ int fat_fill_super(struct super_block *sb, struct fs_context *fc, setup(sb); /* flavour-specific stuff that needs options */ + error = -EINVAL; + if (!sb_min_blocksize(sb, 512)) { + fat_msg(sb, KERN_ERR, "unable to set blocksize"); + goto out_fail; + } error = -EIO; - sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); if (bh == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector"); diff --git a/fs/fcntl.c b/fs/fcntl.c index 72f8433d9109..f93dbca08435 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -445,6 +445,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; + struct delegation deleg; int argi = (int)arg; struct flock flock; long err = -EINVAL; @@ -550,6 +551,18 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_SET_RW_HINT: err = fcntl_set_rw_hint(filp, arg); break; + case F_GETDELEG: + if (copy_from_user(&deleg, argp, sizeof(deleg))) + return -EFAULT; + err = fcntl_getdeleg(filp, &deleg); + if (!err && copy_to_user(argp, &deleg, sizeof(deleg))) + return -EFAULT; + break; + case F_SETDELEG: + if (copy_from_user(&deleg, argp, sizeof(deleg))) + return -EFAULT; + err = fcntl_setdeleg(fd, filp, &deleg); + break; default: break; } diff --git a/fs/fhandle.c b/fs/fhandle.c index 052f9c9368fb..3de1547ec9d4 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -404,32 +404,28 @@ out_path: return retval; } +static struct file *file_open_handle(struct path *path, int open_flag) +{ + const struct export_operations *eops; + + eops = path->mnt->mnt_sb->s_export_op; + if (eops->open) + return eops->open(path, open_flag); + + return file_open_root(path, "", open_flag, 0); +} + static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag) { - long retval = 0; + long retval; struct path path __free(path_put) = {}; - struct file *file; - const struct export_operations *eops; retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) return retval; - CLASS(get_unused_fd, fd)(open_flag); - if (fd < 0) - return fd; - - eops = path.mnt->mnt_sb->s_export_op; - if (eops->open) - file = eops->open(&path, open_flag); - else - file = file_open_root(&path, "", open_flag, 0); - if (IS_ERR(file)) - return PTR_ERR(file); - - fd_install(fd, file); - return take_fd(fd); + return FD_ADD(open_flag, file_open_handle(&path, open_flag)); } /** diff --git a/fs/file.c b/fs/file.c index 28743b742e3c..0a4f3bdb2dec 100644 --- a/fs/file.c +++ b/fs/file.c @@ -641,6 +641,34 @@ void put_unused_fd(unsigned int fd) EXPORT_SYMBOL(put_unused_fd); +/* + * Install a file pointer in the fd array while it is being resized. + * + * We need to make sure our update to the array does not get lost as the resizing + * thread can be copying the content as we modify it. + * + * We have two ways to do it: + * - go off CPU waiting for resize_in_progress to clear + * - take the spin lock + * + * The latter is trivial to implement and saves us from having to might_sleep() + * for debugging purposes. + * + * This is moved out of line from fd_install() to convince gcc to optimize that + * routine better. + */ +static void noinline fd_install_slowpath(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); +} + /** * fd_install - install a file pointer in the fd array * @fd: file descriptor to install the file in @@ -658,14 +686,9 @@ void fd_install(unsigned int fd, struct file *file) return; rcu_read_lock_sched(); - if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); - rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + fd_install_slowpath(fd, file); return; } /* coupled with smp_wmb() in expand_fdtable() */ @@ -1357,28 +1380,25 @@ out_unlock: */ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { - int new_fd; int error; error = security_file_receive(file); if (error) return error; - new_fd = get_unused_fd_flags(o_flags); - if (new_fd < 0) - return new_fd; + FD_PREPARE(fdf, o_flags, file); + if (fdf.err) + return fdf.err; + get_file(file); if (ufd) { - error = put_user(new_fd, ufd); - if (error) { - put_unused_fd(new_fd); + error = put_user(fd_prepare_fd(fdf), ufd); + if (error) return error; - } } - fd_install(new_fd, get_file(file)); - __receive_sock(file); - return new_fd; + __receive_sock(fd_prepare_file(fdf)); + return fd_publish(fdf); } EXPORT_SYMBOL_GPL(receive_fd); diff --git a/fs/file_attr.c b/fs/file_attr.c index 1dcec88c0680..4c4916632f11 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -316,7 +316,6 @@ int ioctl_getflags(struct file *file, unsigned int __user *argp) err = put_user(fa.flags, argp); return err; } -EXPORT_SYMBOL(ioctl_getflags); int ioctl_setflags(struct file *file, unsigned int __user *argp) { @@ -337,7 +336,6 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp) } return err; } -EXPORT_SYMBOL(ioctl_setflags); int ioctl_fsgetxattr(struct file *file, void __user *argp) { @@ -350,7 +348,6 @@ int ioctl_fsgetxattr(struct file *file, void __user *argp) return err; } -EXPORT_SYMBOL(ioctl_fsgetxattr); int ioctl_fssetxattr(struct file *file, void __user *argp) { @@ -369,7 +366,6 @@ int ioctl_fssetxattr(struct file *file, void __user *argp) } return err; } -EXPORT_SYMBOL(ioctl_fssetxattr); SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, struct file_attr __user *, ufattr, size_t, usize, diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 20600e9ea202..21fc94b98209 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -258,7 +258,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino) ip = iget_locked(sbp, ino); if (!ip) return ERR_PTR(-ENOMEM); - if (!(ip->i_state & I_NEW)) + if (!(inode_state_read_once(ip) & I_NEW)) return ip; vip = VXFS_INO(ip); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2b35e80037fe..6800886c4d10 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -14,6 +14,7 @@ * Additions for address_space-based writeback */ +#include <linux/sched/sysctl.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -32,11 +33,6 @@ #include "internal.h" /* - * 4MB minimal write chunk size - */ -#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) - -/* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { @@ -121,7 +117,7 @@ static bool inode_io_list_move_locked(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_FREEING); + WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); list_move(&inode->i_io_list, head); @@ -200,6 +196,19 @@ static void wb_queue_work(struct bdi_writeback *wb, spin_unlock_irq(&wb->work_lock); } +static bool wb_wait_for_completion_cb(struct wb_completion *done) +{ + unsigned long waited_secs = (jiffies - done->wait_start) / HZ; + + done->progress_stamp = jiffies; + if (waited_secs > sysctl_hung_task_timeout_secs) + pr_info("INFO: The task %s:%d has been waiting for writeback " + "completion for more than %lu seconds.", + current->comm, current->pid, waited_secs); + + return !atomic_read(&done->cnt); +} + /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @done: target wb_completion @@ -212,8 +221,9 @@ static void wb_queue_work(struct bdi_writeback *wb, */ void wb_wait_for_completion(struct wb_completion *done) { + done->wait_start = jiffies; atomic_dec(&done->cnt); /* put down the initial count */ - wait_event(*done->waitq, !atomic_read(&done->cnt)); + wait_event(*done->waitq, wb_wait_for_completion_cb(done)); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -304,9 +314,9 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_FREEING); + WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); if (wb != &wb->bdi->wb) list_move(&inode->i_io_list, &wb->b_attached); else @@ -408,7 +418,7 @@ static bool inode_do_switch_wbs(struct inode *inode, * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction * path owns the inode and we shouldn't modify ->i_io_list. */ - if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) + if (unlikely(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); @@ -451,7 +461,7 @@ static bool inode_do_switch_wbs(struct inode *inode, if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; - if (inode->i_state & I_DIRTY_ALL) { + if (inode_state_read(inode) & I_DIRTY_ALL) { /* * We need to keep b_dirty list sorted by * dirtied_time_when. However properly sorting the @@ -476,10 +486,11 @@ static bool inode_do_switch_wbs(struct inode *inode, switched = true; skip_switch: /* - * Paired with load_acquire in unlocked_inode_to_wb_begin() and + * Paired with an acquire fence in unlocked_inode_to_wb_begin() and * ensures that the new wb is visible if they see !I_WB_SWITCH. */ - smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); + smp_wmb(); + inode_state_clear(inode, I_WB_SWITCH); xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); @@ -600,12 +611,12 @@ static bool inode_prepare_wbs_switch(struct inode *inode, /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || - inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || + inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); return false; } - inode->i_state |= I_WB_SWITCH; + inode_state_set(inode, I_WB_SWITCH); __iget(inode); spin_unlock(&inode->i_lock); @@ -635,7 +646,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ - if (inode->i_state & I_WB_SWITCH) + if (inode_state_read_once(inode) & I_WB_SWITCH) return; /* avoid queueing a new switch if too many are already in flight */ @@ -807,9 +818,9 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, * @wbc: writeback_control of interest * @inode: target inode * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. + * This function is to be used by filemap_writeback(), which is an alternative + * entry point into writeback code, and first ensures @inode is associated with + * a bdi_writeback and attaches it to @wbc. */ void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) @@ -1236,9 +1247,9 @@ static void inode_cgwb_move_to_attached(struct inode *inode, { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_FREEING); + WARN_ON_ONCE(inode_state_read(inode) & I_FREEING); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -1348,10 +1359,17 @@ void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; + /* + * FIXME: ext4 can call here from ext4_evict_inode() after evict() already + * unlinked the inode. + */ + if (list_empty_careful(&inode->i_io_list)) + return; + wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); @@ -1409,13 +1427,13 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&inode->i_lock); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); /* * When the inode is being freed just don't bother with dirty list * tracking. Flush worker will ignore this inode anyway and it will * trigger assertions in inode_io_list_move_locked(). */ - if (inode->i_state & I_FREEING) { + if (inode_state_read(inode) & I_FREEING) { list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); return; @@ -1449,9 +1467,9 @@ static void inode_sync_complete(struct inode *inode) { assert_spin_locked(&inode->i_lock); - inode->i_state &= ~I_SYNC; + inode_state_clear(inode, I_SYNC); /* If inode is clean an unused, put it into LRU now... */ - inode_add_lru(inode); + inode_lru_list_add(inode); /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_SYNC); } @@ -1493,7 +1511,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; - inode->i_state |= I_SYNC_QUEUED; + inode_state_set(inode, I_SYNC_QUEUED); spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; @@ -1579,14 +1597,14 @@ void inode_wait_for_writeback(struct inode *inode) assert_spin_locked(&inode->i_lock); - if (!(inode->i_state & I_SYNC)) + if (!(inode_state_read(inode) & I_SYNC)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ - if (!(inode->i_state & I_SYNC)) + if (!(inode_state_read(inode) & I_SYNC)) break; spin_unlock(&inode->i_lock); schedule(); @@ -1612,7 +1630,7 @@ static void inode_sleep_on_writeback(struct inode *inode) wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ - sleep = !!(inode->i_state & I_SYNC); + sleep = !!(inode_state_read(inode) & I_SYNC); spin_unlock(&inode->i_lock); if (sleep) schedule(); @@ -1631,7 +1649,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, struct writeback_control *wbc, unsigned long dirtied_before) { - if (inode->i_state & I_FREEING) + if (inode_state_read(inode) & I_FREEING) return; /* @@ -1639,7 +1657,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * shot. If still dirty, it will be redirty_tail()'ed below. Update * the dirty time to prevent enqueue and sync it again. */ - if ((inode->i_state & I_DIRTY) && + if ((inode_state_read(inode) & I_DIRTY) && (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) inode->dirtied_when = jiffies; @@ -1650,7 +1668,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * is odd for clean inodes, it can happen for some * filesystems so handle that gracefully. */ - if (inode->i_state & I_DIRTY_ALL) + if (inode_state_read(inode) & I_DIRTY_ALL) redirty_tail_locked(inode, wb); else inode_cgwb_move_to_attached(inode, wb); @@ -1676,17 +1694,17 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, */ redirty_tail_locked(inode, wb); } - } else if (inode->i_state & I_DIRTY) { + } else if (inode_state_read(inode) & I_DIRTY) { /* * Filesystems can dirty the inode during writeback operations, * such as delayed allocation during submission or metadata * updates after data IO completion. */ redirty_tail_locked(inode, wb); - } else if (inode->i_state & I_DIRTY_TIME) { + } else if (inode_state_read(inode) & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); - inode->i_state &= ~I_SYNC_QUEUED; + inode_state_clear(inode, I_SYNC_QUEUED); } else { /* The inode is clean. Remove from writeback lists. */ inode_cgwb_move_to_attached(inode, wb); @@ -1712,7 +1730,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) unsigned dirty; int ret; - WARN_ON(!(inode->i_state & I_SYNC)); + WARN_ON(!(inode_state_read_once(inode) & I_SYNC)); trace_writeback_single_inode_start(inode, wbc, nr_to_write); @@ -1736,7 +1754,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * mark_inode_dirty_sync() to notify the filesystem about it and to * change I_DIRTY_TIME into I_DIRTY_SYNC. */ - if ((inode->i_state & I_DIRTY_TIME) && + if ((inode_state_read_once(inode) & I_DIRTY_TIME) && (wbc->sync_mode == WB_SYNC_ALL || time_after(jiffies, inode->dirtied_time_when + dirtytime_expire_interval * HZ))) { @@ -1751,8 +1769,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * after handling timestamp expiration, as that may dirty the inode too. */ spin_lock(&inode->i_lock); - dirty = inode->i_state & I_DIRTY; - inode->i_state &= ~dirty; + dirty = inode_state_read(inode) & I_DIRTY; + inode_state_clear(inode, dirty); /* * Paired with smp_mb() in __mark_inode_dirty(). This allows @@ -1768,10 +1786,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) smp_mb(); if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - inode->i_state |= I_DIRTY_PAGES; - else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) { - if (!(inode->i_state & I_DIRTY_PAGES)) { - inode->i_state &= ~I_PINNING_NETFS_WB; + inode_state_set(inode, I_DIRTY_PAGES); + else if (unlikely(inode_state_read(inode) & I_PINNING_NETFS_WB)) { + if (!(inode_state_read(inode) & I_DIRTY_PAGES)) { + inode_state_clear(inode, I_PINNING_NETFS_WB); wbc->unpinned_netfs_wb = true; dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */ } @@ -1807,11 +1825,11 @@ static int writeback_single_inode(struct inode *inode, spin_lock(&inode->i_lock); if (!icount_read(inode)) - WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); + WARN_ON(!(inode_state_read(inode) & (I_WILL_FREE | I_FREEING))); else - WARN_ON(inode->i_state & I_WILL_FREE); + WARN_ON(inode_state_read(inode) & I_WILL_FREE); - if (inode->i_state & I_SYNC) { + if (inode_state_read(inode) & I_SYNC) { /* * Writeback is already running on the inode. For WB_SYNC_NONE, * that's enough and we can just return. For WB_SYNC_ALL, we @@ -1822,7 +1840,7 @@ static int writeback_single_inode(struct inode *inode, goto out; inode_wait_for_writeback(inode); } - WARN_ON(inode->i_state & I_SYNC); + WARN_ON(inode_state_read(inode) & I_SYNC); /* * If the inode is already fully clean, then there's nothing to do. * @@ -1830,11 +1848,11 @@ static int writeback_single_inode(struct inode *inode, * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If * there are any such pages, we'll need to wait for them. */ - if (!(inode->i_state & I_DIRTY_ALL) && + if (!(inode_state_read(inode) & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; - inode->i_state |= I_SYNC; + inode_state_set(inode, I_SYNC); wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); @@ -1847,18 +1865,18 @@ static int writeback_single_inode(struct inode *inode, * If the inode is freeing, its i_io_list shoudn't be updated * as it can be finally deleted at this moment. */ - if (!(inode->i_state & I_FREEING)) { + if (!(inode_state_read(inode) & I_FREEING)) { /* * If the inode is now fully clean, then it can be safely * removed from its writeback list (if any). Otherwise the * flusher threads are responsible for the writeback lists. */ - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read(inode) & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); - else if (!(inode->i_state & I_SYNC_QUEUED)) { - if ((inode->i_state & I_DIRTY)) + else if (!(inode_state_read(inode) & I_SYNC_QUEUED)) { + if ((inode_state_read(inode) & I_DIRTY)) redirty_tail_locked(inode, wb); - else if (inode->i_state & I_DIRTY_TIME) { + else if (inode_state_read(inode) & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, @@ -1874,8 +1892,8 @@ out: return ret; } -static long writeback_chunk_size(struct bdi_writeback *wb, - struct wb_writeback_work *work) +static long writeback_chunk_size(struct super_block *sb, + struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -1893,16 +1911,13 @@ static long writeback_chunk_size(struct bdi_writeback *wb, * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) - pages = LONG_MAX; - else { - pages = min(wb->avg_write_bandwidth / 2, - global_wb_domain.dirty_limit / DIRTY_SCOPE); - pages = min(pages, work->nr_pages); - pages = round_down(pages + MIN_WRITEBACK_PAGES, - MIN_WRITEBACK_PAGES); - } + return LONG_MAX; - return pages; + pages = min(wb->avg_write_bandwidth / 2, + global_wb_domain.dirty_limit / DIRTY_SCOPE); + pages = min(pages, work->nr_pages); + return round_down(pages + sb->s_min_writeback_pages, + sb->s_min_writeback_pages); } /* @@ -1967,12 +1982,12 @@ static long writeback_sb_inodes(struct super_block *sb, * kind writeout is handled by the freer. */ spin_lock(&inode->i_lock); - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) { redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); continue; } - if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { + if ((inode_state_read(inode) & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { /* * If this inode is locked for writeback and we are not * doing writeback-for-data-integrity, move it to @@ -1994,17 +2009,17 @@ static long writeback_sb_inodes(struct super_block *sb, * are doing WB_SYNC_NONE writeback. So this catches only the * WB_SYNC_ALL case. */ - if (inode->i_state & I_SYNC) { + if (inode_state_read(inode) & I_SYNC) { /* Wait for I_SYNC. This function drops i_lock... */ inode_sleep_on_writeback(inode); /* Inode may be gone, start again */ spin_lock(&wb->list_lock); continue; } - inode->i_state |= I_SYNC; + inode_state_set(inode, I_SYNC); wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb, work); + write_chunk = writeback_chunk_size(inode->i_sb, wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; @@ -2014,6 +2029,12 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + /* Report progress to inform the hung task detector of the progress. */ + if (work->done && work->done->progress_stamp && + (jiffies - work->done->progress_stamp) > HZ * + sysctl_hung_task_timeout_secs / 2) + wake_up_all(work->done->waitq); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; @@ -2039,7 +2060,7 @@ static long writeback_sb_inodes(struct super_block *sb, */ tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read(inode) & I_DIRTY_ALL)) total_wrote++; requeue_inode(inode, tmp_wb, &wbc, dirtied_before); inode_sync_complete(inode); @@ -2545,10 +2566,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) * We tell ->dirty_inode callback that timestamps need to * be updated by setting I_DIRTY_TIME in flags. */ - if (inode->i_state & I_DIRTY_TIME) { + if (inode_state_read_once(inode) & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - if (inode->i_state & I_DIRTY_TIME) { - inode->i_state &= ~I_DIRTY_TIME; + if (inode_state_read(inode) & I_DIRTY_TIME) { + inode_state_clear(inode, I_DIRTY_TIME); flags |= I_DIRTY_TIME; } spin_unlock(&inode->i_lock); @@ -2585,16 +2606,16 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ smp_mb(); - if ((inode->i_state & flags) == flags) + if ((inode_state_read_once(inode) & flags) == flags) return; spin_lock(&inode->i_lock); - if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; + if ((inode_state_read(inode) & flags) != flags) { + const int was_dirty = inode_state_read(inode) & I_DIRTY; inode_attach_wb(inode, NULL); - inode->i_state |= flags; + inode_state_set(inode, flags); /* * Grab inode's wb early because it requires dropping i_lock and we @@ -2613,7 +2634,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * the inode it will place it on the appropriate superblock * list, based upon its state. */ - if (inode->i_state & I_SYNC_QUEUED) + if (inode_state_read(inode) & I_SYNC_QUEUED) goto out_unlock; /* @@ -2624,7 +2645,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (inode_unhashed(inode)) goto out_unlock; } - if (inode->i_state & I_FREEING) + if (inode_state_read(inode) & I_FREEING) goto out_unlock; /* @@ -2639,7 +2660,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (dirtytime) inode->dirtied_time_when = jiffies; - if (inode->i_state & I_DIRTY) + if (inode_state_read(inode) & I_DIRTY) dirty_list = &wb->b_dirty; else dirty_list = &wb->b_dirty_time; @@ -2736,7 +2757,7 @@ static void wait_sb_inodes(struct super_block *sb) spin_unlock_irq(&sb->s_inode_wblist_lock); spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); spin_lock_irq(&sb->s_inode_wblist_lock); diff --git a/fs/fs_types.c b/fs/fs_dirent.c index 78365e5dc08c..e5e08f213816 100644 --- a/fs/fs_types.c +++ b/fs/fs_dirent.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/fs.h> +#include <linux/fs_dirent.h> #include <linux/export.h> /* diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 28be762ac1c6..b8c46c5a38a0 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -146,12 +146,6 @@ int unshare_fs_struct(void) } EXPORT_SYMBOL_GPL(unshare_fs_struct); -int current_umask(void) -{ - return current->fs->umask; -} -EXPORT_SYMBOL(current_umask); - /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ecaec0fea3a1..87a63ae93a45 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1192,7 +1192,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, if (attr->blksize != 0) blkbits = ilog2(attr->blksize); else - blkbits = fc->blkbits; + blkbits = inode->i_sb->s_blocksize_bits; stat->blksize = 1 << blkbits; } @@ -1397,27 +1397,25 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, if (!parent) return -ENOENT; - inode_lock_nested(parent, I_MUTEX_PARENT); if (!S_ISDIR(parent->i_mode)) - goto unlock; + goto put_parent; err = -ENOENT; dir = d_find_alias(parent); if (!dir) - goto unlock; + goto put_parent; - name->hash = full_name_hash(dir, name->name, name->len); - entry = d_lookup(dir, name); + entry = start_removing_noperm(dir, name); dput(dir); - if (!entry) - goto unlock; + if (IS_ERR(entry)) + goto put_parent; fuse_dir_changed(parent); if (!(flags & FUSE_EXPIRE_ONLY)) d_invalidate(entry); fuse_invalidate_entry_cache(entry); - if (child_nodeid != 0 && d_really_is_positive(entry)) { + if (child_nodeid != 0) { inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; @@ -1445,10 +1443,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, } else { err = 0; } - dput(entry); - unlock: - inode_unlock(parent); + end_removing(entry); + put_parent: iput(parent); return err; } @@ -2230,6 +2227,7 @@ static const struct file_operations fuse_dir_operations = { .fsync = fuse_dir_fsync, .unlocked_ioctl = fuse_dir_ioctl, .compat_ioctl = fuse_dir_compat_ioctl, + .setlease = simple_nosetlease, }; static const struct inode_operations fuse_common_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f1ef77a0be05..7bcb650a9f26 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -834,23 +834,142 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio, return 0; } +static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + iomap->type = IOMAP_MAPPED; + iomap->length = length; + iomap->offset = offset; + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, +}; + +struct fuse_fill_read_data { + struct file *file; + + /* Fields below are used if sending the read request asynchronously */ + struct fuse_conn *fc; + struct fuse_io_args *ia; + unsigned int nr_bytes; +}; + +/* forward declarations */ +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, + unsigned cur_bytes, bool write); +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count, bool async); + +static int fuse_handle_readahead(struct folio *folio, + struct readahead_control *rac, + struct fuse_fill_read_data *data, loff_t pos, + size_t len) +{ + struct fuse_io_args *ia = data->ia; + size_t off = offset_in_folio(folio, pos); + struct fuse_conn *fc = data->fc; + struct fuse_args_pages *ap; + unsigned int nr_pages; + + if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes, + false)) { + fuse_send_readpages(ia, data->file, data->nr_bytes, + fc->async_read); + data->nr_bytes = 0; + data->ia = NULL; + ia = NULL; + } + if (!ia) { + if (fc->num_background >= fc->congestion_threshold && + rac->ra->async_size >= readahead_count(rac)) + /* + * Congested and only async pages left, so skip the + * rest. + */ + return -EAGAIN; + + nr_pages = min(fc->max_pages, readahead_count(rac)); + data->ia = fuse_io_alloc(NULL, nr_pages); + if (!data->ia) + return -ENOMEM; + ia = data->ia; + } + folio_get(folio); + ap = &ia->ap; + ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = off; + ap->descs[ap->num_folios].length = len; + data->nr_bytes += len; + ap->num_folios++; + + return 0; +} + +static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, + size_t len) +{ + struct fuse_fill_read_data *data = ctx->read_ctx; + struct folio *folio = ctx->cur_folio; + loff_t pos = iter->pos; + size_t off = offset_in_folio(folio, pos); + struct file *file = data->file; + int ret; + + if (ctx->rac) { + ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len); + } else { + /* + * for non-readahead read requests, do reads synchronously + * since it's not guaranteed that the server can handle + * out-of-order reads + */ + ret = fuse_do_readfolio(file, folio, off, len); + if (!ret) + iomap_finish_folio_read(folio, off, len, ret); + } + return ret; +} + +static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) +{ + struct fuse_fill_read_data *data = ctx->read_ctx; + + if (data->ia) + fuse_send_readpages(data->ia, data->file, data->nr_bytes, + data->fc->async_read); +} + +static const struct iomap_read_ops fuse_iomap_read_ops = { + .read_folio_range = fuse_iomap_read_folio_range_async, + .submit_read = fuse_iomap_read_submit, +}; + static int fuse_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; - int err; + struct fuse_fill_read_data data = { + .file = file, + }; + struct iomap_read_folio_ctx ctx = { + .cur_folio = folio, + .ops = &fuse_iomap_read_ops, + .read_ctx = &data, - err = -EIO; - if (fuse_is_bad(inode)) - goto out; + }; - err = fuse_do_readfolio(file, folio, 0, folio_size(folio)); - if (!err) - folio_mark_uptodate(folio); + if (fuse_is_bad(inode)) { + folio_unlock(folio); + return -EIO; + } + iomap_read_folio(&fuse_iomap_ops, &ctx); fuse_invalidate_atime(inode); - out: - folio_unlock(folio); - return err; + return 0; } static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, @@ -887,7 +1006,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); for (i = 0; i < ap->num_folios; i++) { - folio_end_read(ap->folios[i], !err); + iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset, + ap->descs[i].length, err); folio_put(ap->folios[i]); } if (ia->ff) @@ -897,7 +1017,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, } static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, - unsigned int count) + unsigned int count, bool async) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; @@ -919,7 +1039,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, fuse_read_args_fill(ia, file, pos, count, FUSE_READ); ia->read.attr_ver = fuse_get_attr_version(fm->fc); - if (fm->fc->async_read) { + if (async) { ia->ff = fuse_file_get(ff); ap->args.end = fuse_readpages_end; err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); @@ -936,81 +1056,20 @@ static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - unsigned int max_pages, nr_pages; - struct folio *folio = NULL; + struct fuse_fill_read_data data = { + .file = rac->file, + .fc = fc, + }; + struct iomap_read_folio_ctx ctx = { + .ops = &fuse_iomap_read_ops, + .rac = rac, + .read_ctx = &data + }; if (fuse_is_bad(inode)) return; - max_pages = min_t(unsigned int, fc->max_pages, - fc->max_read / PAGE_SIZE); - - /* - * This is only accurate the first time through, since readahead_folio() - * doesn't update readahead_count() from the previous folio until the - * next call. Grab nr_pages here so we know how many pages we're going - * to have to process. This means that we will exit here with - * readahead_count() == folio_nr_pages(last_folio), but we will have - * consumed all of the folios, and read_pages() will call - * readahead_folio() again which will clean up the rac. - */ - nr_pages = readahead_count(rac); - - while (nr_pages) { - struct fuse_io_args *ia; - struct fuse_args_pages *ap; - unsigned cur_pages = min(max_pages, nr_pages); - unsigned int pages = 0; - - if (fc->num_background >= fc->congestion_threshold && - rac->ra->async_size >= readahead_count(rac)) - /* - * Congested and only async pages left, so skip the - * rest. - */ - break; - - ia = fuse_io_alloc(NULL, cur_pages); - if (!ia) - break; - ap = &ia->ap; - - while (pages < cur_pages) { - unsigned int folio_pages; - - /* - * This returns a folio with a ref held on it. - * The ref needs to be held until the request is - * completed, since the splice case (see - * fuse_try_move_page()) drops the ref after it's - * replaced in the page cache. - */ - if (!folio) - folio = __readahead_folio(rac); - - folio_pages = folio_nr_pages(folio); - if (folio_pages > cur_pages - pages) { - /* - * Large folios belonging to fuse will never - * have more pages than max_pages. - */ - WARN_ON(!pages); - break; - } - - ap->folios[ap->num_folios] = folio; - ap->descs[ap->num_folios].length = folio_size(folio); - ap->num_folios++; - pages += folio_pages; - folio = NULL; - } - fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT); - nr_pages -= pages; - } - if (folio) { - folio_end_read(folio, false); - folio_put(folio); - } + iomap_readahead(&fuse_iomap_ops, &ctx); } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -1397,20 +1456,6 @@ static const struct iomap_write_ops fuse_iomap_write_ops = { .read_folio_range = fuse_iomap_read_folio_range, }; -static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) -{ - iomap->type = IOMAP_MAPPED; - iomap->length = length; - iomap->offset = offset; - return 0; -} - -static const struct iomap_ops fuse_iomap_ops = { - .iomap_begin = fuse_iomap_begin, -}; - static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1834,7 +1879,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. */ - iomap_finish_folio_write(inode, ap->folios[i], 1); + iomap_finish_folio_write(inode, ap->folios[i], + ap->descs[i].length); wake_up(&fi->page_waitq); } @@ -2047,7 +2093,7 @@ struct fuse_fill_wb_data { struct fuse_file *ff; unsigned int max_folios; /* - * nr_bytes won't overflow since fuse_writepage_need_send() caps + * nr_bytes won't overflow since fuse_folios_need_send() caps * wb requests to never exceed fc->max_pages (which has an upper bound * of U16_MAX). */ @@ -2092,14 +2138,15 @@ static void fuse_writepages_send(struct inode *inode, spin_unlock(&fi->lock); } -static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, - unsigned len, struct fuse_args_pages *ap, - struct fuse_fill_wb_data *data) +static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, + unsigned cur_bytes, bool write) { struct folio *prev_folio; struct fuse_folio_desc prev_desc; - unsigned bytes = data->nr_bytes + len; + unsigned bytes = cur_bytes + len; loff_t prev_pos; + size_t max_bytes = write ? fc->max_write : fc->max_read; WARN_ON(!ap->num_folios); @@ -2107,8 +2154,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages) return true; - /* Reached max write bytes */ - if (bytes > fc->max_write) + if (bytes > max_bytes) return true; /* Discontinuity */ @@ -2118,11 +2164,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, if (prev_pos != pos) return true; - /* Need to grow the pages array? If so, did the expansion fail? */ - if (ap->num_folios == data->max_folios && - !fuse_pages_realloc(data, fc->max_pages)) - return true; - return false; } @@ -2146,10 +2187,24 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, return -EIO; } - if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) { - fuse_writepages_send(inode, data); - data->wpa = NULL; - data->nr_bytes = 0; + if (wpa) { + bool send = fuse_folios_need_send(fc, pos, len, ap, + data->nr_bytes, true); + + if (!send) { + /* + * Need to grow the pages array? If so, did the + * expansion fail? + */ + send = (ap->num_folios == data->max_folios) && + !fuse_pages_realloc(data, fc->max_pages); + } + + if (send) { + fuse_writepages_send(inode, data); + data->wpa = NULL; + data->nr_bytes = 0; + } } if (data->wpa == NULL) { @@ -2161,7 +2216,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, ap = &wpa->ia.ap; } - iomap_start_folio_write(inode, folio, 1); fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, offset, len); data->nr_bytes += len; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c2f2a48156d6..f616c1991fed 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -981,14 +981,6 @@ struct fuse_conn { /* Request timeout (in jiffies). 0 = no timeout */ unsigned int req_timeout; } timeout; - - /* - * This is a workaround until fuse uses iomap for reads. - * For fuseblk servers, this represents the blocksize passed in at - * mount time and for regular fuse servers, this is equivalent to - * inode->i_blkbits. - */ - u8 blkbits; }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d1babf56f254..1a397be53f49 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -160,7 +160,7 @@ static void fuse_evict_inode(struct inode *inode) struct fuse_inode *fi = get_fuse_inode(inode); /* Will write inode on close/munmap and in all other dirtiers */ - WARN_ON(inode->i_state & I_DIRTY_INODE); + WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE); if (FUSE_IS_DAX(inode)) dax_break_layout_final(inode); @@ -291,7 +291,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, if (attr->blksize) fi->cached_i_blkbits = ilog2(attr->blksize); else - fi->cached_i_blkbits = fc->blkbits; + fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits; /* * Don't set the sticky bit in i_mode, unless we want the VFS @@ -505,7 +505,7 @@ retry: if (!inode) return NULL; - if ((inode->i_state & I_NEW)) { + if ((inode_state_read_once(inode) & I_NEW)) { inode->i_flags |= S_NOATIME; if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; @@ -1838,22 +1838,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err = -EINVAL; if (!sb_set_blocksize(sb, ctx->blksize)) goto err; - /* - * This is a workaround until fuse hooks into iomap for reads. - * Use PAGE_SIZE for the blocksize else if the writeback cache - * is enabled, buffered writes go through iomap and a read may - * overwrite partially written data if blocksize < PAGE_SIZE - */ - fc->blkbits = sb->s_blocksize_bits; - if (ctx->blksize != PAGE_SIZE && - !sb_set_blocksize(sb, PAGE_SIZE)) - goto err; #endif fc->sync_fs = 1; } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - fc->blkbits = sb->s_blocksize_bits; } sb->s_subtype = ctx->subtype; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6bc7c97b017d..b2f6486fe1d5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -373,7 +373,7 @@ static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs) sprintf(buff, "%d", i); fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj); - if (!fs->mqs_kobj) { + if (!fsvq->kobj) { ret = -ENOMEM; goto out_del; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 47d74afd63ac..ff1cf335449a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -81,8 +81,7 @@ static int gfs2_write_jdata_folio(struct folio *folio, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - if (folio_pos(folio) < i_size && - i_size < folio_pos(folio) + folio_size(folio)) + if (folio_pos(folio) < i_size && i_size < folio_next_pos(folio)) folio_zero_segment(folio, offset_in_folio(folio, i_size), folio_size(folio)); @@ -311,10 +310,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; + tag = wbc_to_tag(wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) @@ -424,11 +420,11 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) struct inode *inode = folio->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; + int error = 0; if (!gfs2_is_jdata(ip) || (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { - error = iomap_read_folio(folio, &gfs2_iomap_ops); + iomap_bio_read_folio(folio, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { error = stuffed_read_folio(ip, folio); } else { @@ -503,7 +499,7 @@ static void gfs2_readahead(struct readahead_control *rac) else if (gfs2_is_jdata(ip)) mpage_readahead(rac, gfs2_block_map); else - iomap_readahead(rac, &gfs2_iomap_ops); + iomap_bio_readahead(rac, &gfs2_iomap_ops); } /** diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index bc67fa058c84..ee92f5910ae1 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -744,7 +744,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - int sync_state = inode->i_state & I_DIRTY; + int sync_state = inode_state_read_once(inode) & I_DIRTY; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b677c0e6b9ab..c9712235e7a0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -957,7 +957,7 @@ static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl) ip = NULL; spin_unlock(&gl->gl_lockref.lock); if (ip) { - wait_on_inode(&ip->i_inode); + wait_on_new_inode(&ip->i_inode); if (is_bad_inode(&ip->i_inode)) { iput(&ip->i_inode); ip = NULL; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 0c0a80b3baca..c94e42b0c94d 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -394,7 +394,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) u16 height, depth; umode_t mode = be32_to_cpu(str->di_mode); struct inode *inode = &ip->i_inode; - bool is_new = inode->i_state & I_NEW; + bool is_new = inode_state_read_once(inode) & I_NEW; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) { gfs2_consist_inode(ip); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8a7ed80d9f2d..890c87e3e365 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -127,7 +127,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, ip = GFS2_I(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_glock *io_gl; int extra_flags = 0; @@ -924,7 +924,7 @@ fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(&d_gh); if (!IS_ERR_OR_NULL(inode)) { - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) iget_failed(inode); else iput(inode); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index aa15183f9a16..889682f051ea 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1751,7 +1751,7 @@ static void gfs2_evict_inodes(struct super_block *sb) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) && + if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) && !need_resched()) { spin_unlock(&inode->i_lock); continue; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 22e62fe7448b..54c20d01c342 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -42,7 +42,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke tree->inode = iget_locked(sb, id); if (!tree->inode) goto free_tree; - BUG_ON(!(tree->inode->i_state & I_NEW)); + BUG_ON(!(inode_state_read_once(tree->inode) & I_NEW)); { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; HFS_I(tree->inode)->flags = 0; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 9cd449913dc8..81ad93e6312f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -412,7 +412,7 @@ struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_ return NULL; } inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data); - if (inode && (inode->i_state & I_NEW)) + if (inode && (inode_state_read_once(inode) & I_NEW)) unlock_new_inode(inode); return inode; } diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index a66a09a56bf7..9b377481f397 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/nls.h> diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 16bc4abc67e0..54e85e25a259 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -65,7 +65,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; atomic_set(&HFSPLUS_I(inode)->opencnt, 0); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 1e1acf5775ab..51d26aa2b93e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -581,7 +581,7 @@ static struct inode *hostfs_iget(struct super_block *sb, char *name) if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { unlock_new_inode(inode); } else { spin_lock(&inode->i_lock); @@ -979,7 +979,7 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hostfs_fs_info *fsi = fc->s_fs_info; struct fs_parse_result result; - char *host_root; + char *host_root, *tmp_root; int opt; opt = fs_parse(fc, hostfs_param_specs, param, &result); @@ -990,11 +990,13 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_hostfs: host_root = param->string; if (!*host_root) - host_root = ""; - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) + break; + tmp_root = kasprintf(GFP_KERNEL, "%s%s", + fsi->host_root_path, host_root); + if (!tmp_root) return -ENOMEM; + kfree(fsi->host_root_path); + fsi->host_root_path = tmp_root; break; } @@ -1004,17 +1006,17 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) static int hostfs_parse_monolithic(struct fs_context *fc, void *data) { struct hostfs_fs_info *fsi = fc->s_fs_info; - char *host_root = (char *)data; + char *tmp_root, *host_root = (char *)data; /* NULL is printed as '(null)' by printf(): avoid that. */ if (host_root == NULL) - host_root = ""; + return 0; - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) + tmp_root = kasprintf(GFP_KERNEL, "%s%s", fsi->host_root_path, host_root); + if (!tmp_root) return -ENOMEM; - + kfree(fsi->host_root_path); + fsi->host_root_path = tmp_root; return 0; } @@ -1049,6 +1051,11 @@ static int hostfs_init_fs_context(struct fs_context *fc) if (!fsi) return -ENOMEM; + fsi->host_root_path = kasprintf(GFP_KERNEL, "%s/", root_ino); + if (!fsi->host_root_path) { + kfree(fsi); + return -ENOMEM; + } fc->s_fs_info = fsi; fc->ops = &hostfs_context_ops; return 0; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 49dd585c2b17..ceb50b2dc91a 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in result = ERR_PTR(-ENOMEM); goto bail1; } - if (result->i_state & I_NEW) { + if (inode_state_read_once(result) & I_NEW) { hpfs_init_inode(result); if (de->directory) hpfs_read_inode(result); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 34008442ee26..93d528f4f4f2 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -196,7 +196,7 @@ void hpfs_write_inode(struct inode *i) parent = iget_locked(i->i_sb, hpfs_inode->i_parent_dir); if (parent) { hpfs_inode->i_dirty = 0; - if (parent->i_state & I_NEW) { + if (inode_state_read_once(parent) & I_NEW) { hpfs_init_inode(parent); hpfs_read_inode(parent); unlock_new_inode(parent); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 8ab85e7ac91e..371aa6de8075 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include <linux/module.h> +#include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/init.h> diff --git a/fs/init.c b/fs/init.c index 07f592ccdba8..e0f5429c0a49 100644 --- a/fs/init.c +++ b/fs/init.c @@ -157,7 +157,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) error = security_path_mknod(&path, dentry, mode, dev); if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, mode, new_decode_dev(dev)); + dentry, mode, new_decode_dev(dev), NULL); end_creating_path(&path, dentry); return error; } @@ -209,7 +209,7 @@ int __init init_symlink(const char *oldname, const char *newname) error = security_path_symlink(&path, dentry, oldname); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, oldname); + dentry, oldname, NULL); end_creating_path(&path, dentry); return error; } @@ -233,7 +233,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) error = security_path_mkdir(&path, dentry, mode); if (!error) { dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, mode); + dentry, mode, NULL); if (IS_ERR(dentry)) error = PTR_ERR(dentry); } diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..cc8265cfe80e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -233,7 +233,7 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; - inode->i_state = 0; + inode_state_assign_raw(inode, 0); atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; @@ -471,7 +471,7 @@ EXPORT_SYMBOL(set_nlink); void inc_nlink(struct inode *inode) { if (unlikely(inode->i_nlink == 0)) { - WARN_ON(!(inode->i_state & I_LINKABLE)); + WARN_ON(!(inode_state_read_once(inode) & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); } @@ -530,9 +530,48 @@ void ihold(struct inode *inode) } EXPORT_SYMBOL(ihold); -static void __inode_add_lru(struct inode *inode, bool rotate) +struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, + struct inode *inode, u32 bit) +{ + void *bit_address; + + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); +} +EXPORT_SYMBOL(inode_bit_waitqueue); + +void wait_on_new_inode(struct inode *inode) +{ + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; + + spin_lock(&inode->i_lock); + if (!(inode_state_read(inode) & I_NEW)) { + spin_unlock(&inode->i_lock); + return; + } + + wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); + for (;;) { + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + if (!(inode_state_read(inode) & I_NEW)) + break; + spin_unlock(&inode->i_lock); + schedule(); + spin_lock(&inode->i_lock); + } + finish_wait(wq_head, &wqe.wq_entry); + WARN_ON(inode_state_read(inode) & I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(wait_on_new_inode); + +static void __inode_lru_list_add(struct inode *inode, bool rotate) { - if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) + lockdep_assert_held(&inode->i_lock); + + if (inode_state_read(inode) & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; if (icount_read(inode)) return; @@ -544,32 +583,22 @@ static void __inode_add_lru(struct inode *inode, bool rotate) if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) - inode->i_state |= I_REFERENCED; -} - -struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, - struct inode *inode, u32 bit) -{ - void *bit_address; - - bit_address = inode_state_wait_address(inode, bit); - init_wait_var_entry(wqe, bit_address, 0); - return __var_waitqueue(bit_address); + inode_state_set(inode, I_REFERENCED); } -EXPORT_SYMBOL(inode_bit_waitqueue); /* * Add inode to LRU if needed (inode is unused and clean). - * - * Needs inode->i_lock held. */ -void inode_add_lru(struct inode *inode) +void inode_lru_list_add(struct inode *inode) { - __inode_add_lru(inode, false); + __inode_lru_list_add(inode, false); } static void inode_lru_list_del(struct inode *inode) { + if (list_empty(&inode->i_lru)) + return; + if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } @@ -577,15 +606,15 @@ static void inode_lru_list_del(struct inode *inode) static void inode_pin_lru_isolating(struct inode *inode) { lockdep_assert_held(&inode->i_lock); - WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); - inode->i_state |= I_LRU_ISOLATING; + WARN_ON(inode_state_read(inode) & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); + inode_state_set(inode, I_LRU_ISOLATING); } static void inode_unpin_lru_isolating(struct inode *inode) { spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); - inode->i_state &= ~I_LRU_ISOLATING; + WARN_ON(!(inode_state_read(inode) & I_LRU_ISOLATING)); + inode_state_clear(inode, I_LRU_ISOLATING); /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_LRU_ISOLATING); spin_unlock(&inode->i_lock); @@ -597,7 +626,7 @@ static void inode_wait_for_lru_isolating(struct inode *inode) struct wait_queue_head *wq_head; lockdep_assert_held(&inode->i_lock); - if (!(inode->i_state & I_LRU_ISOLATING)) + if (!(inode_state_read(inode) & I_LRU_ISOLATING)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); @@ -607,14 +636,14 @@ static void inode_wait_for_lru_isolating(struct inode *inode) * Checking I_LRU_ISOLATING with inode->i_lock guarantees * memory ordering. */ - if (!(inode->i_state & I_LRU_ISOLATING)) + if (!(inode_state_read(inode) & I_LRU_ISOLATING)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); - WARN_ON(inode->i_state & I_LRU_ISOLATING); + WARN_ON(inode_state_read(inode) & I_LRU_ISOLATING); } /** @@ -761,11 +790,11 @@ void clear_inode(struct inode *inode) */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.i_private_list)); - BUG_ON(!(inode->i_state & I_FREEING)); - BUG_ON(inode->i_state & I_CLEAR); + BUG_ON(!(inode_state_read_once(inode) & I_FREEING)); + BUG_ON(inode_state_read_once(inode) & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ - inode->i_state = I_FREEING | I_CLEAR; + inode_state_assign_raw(inode, I_FREEING | I_CLEAR); } EXPORT_SYMBOL(clear_inode); @@ -786,12 +815,10 @@ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; - BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(!(inode_state_read_once(inode) & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - if (!list_empty(&inode->i_io_list)) - inode_io_list_del(inode); - + inode_io_list_del(inode); inode_sb_list_del(inode); spin_lock(&inode->i_lock); @@ -829,7 +856,7 @@ static void evict(struct inode *inode) * This also means we don't need any fences for the call below. */ inode_wake_up_bit(inode, __I_NEW); - BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + BUG_ON(inode_state_read_once(inode) != (I_FREEING | I_CLEAR)); destroy_inode(inode); } @@ -879,12 +906,12 @@ again: spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; } - inode->i_state |= I_FREEING; + inode_state_set(inode, I_FREEING); inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); @@ -938,7 +965,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * sync, or the last page cache deletion will requeue them. */ if (icount_read(inode) || - (inode->i_state & ~I_REFERENCED) || + (inode_state_read(inode) & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); @@ -947,8 +974,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, } /* Recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { - inode->i_state &= ~I_REFERENCED; + if (inode_state_read(inode) & I_REFERENCED) { + inode_state_clear(inode, I_REFERENCED); spin_unlock(&inode->i_lock); return LRU_ROTATE; } @@ -975,8 +1002,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_RETRY; } - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; + WARN_ON(inode_state_read(inode) & I_NEW); + inode_state_set(inode, I_FREEING); list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); @@ -1008,7 +1035,8 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), - void *data, bool is_inode_hash_locked) + void *data, bool is_inode_hash_locked, + bool *isnew) { struct inode *inode = NULL; @@ -1025,16 +1053,17 @@ repeat: if (!test(inode, data)) continue; spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } - if (unlikely(inode->i_state & I_CREATING)) { + if (unlikely(inode_state_read(inode) & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); + *isnew = !!(inode_state_read(inode) & I_NEW); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; @@ -1049,7 +1078,7 @@ repeat: */ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino, - bool is_inode_hash_locked) + bool is_inode_hash_locked, bool *isnew) { struct inode *inode = NULL; @@ -1066,16 +1095,17 @@ repeat: if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } - if (unlikely(inode->i_state & I_CREATING)) { + if (unlikely(inode_state_read(inode) & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); + *isnew = !!(inode_state_read(inode) & I_NEW); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; @@ -1180,14 +1210,8 @@ void unlock_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW & ~I_CREATING; - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb(); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } @@ -1197,14 +1221,8 @@ void discard_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_NEW)); - inode->i_state &= ~I_NEW; - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb(); + WARN_ON(!(inode_state_read(inode) & I_NEW)); + inode_state_clear(inode, I_NEW); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); @@ -1260,6 +1278,7 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set + * @isnew: pointer to a bool which will indicate whether I_NEW is set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a @@ -1278,12 +1297,13 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; + bool isnew; might_sleep(); again: spin_lock(&inode_hash_lock); - old = find_inode(inode->i_sb, head, test, data, true); + old = find_inode(inode->i_sb, head, test, data, true, &isnew); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. @@ -1292,7 +1312,8 @@ again: spin_unlock(&inode_hash_lock); if (IS_ERR(old)) return NULL; - wait_on_inode(old); + if (unlikely(isnew)) + wait_on_new_inode(old); if (unlikely(inode_unhashed(old))) { iput(old); goto again; @@ -1310,7 +1331,7 @@ again: * caller is responsible for filling in the contents */ spin_lock(&inode->i_lock); - inode->i_state |= I_NEW; + inode_state_set(inode, I_NEW); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); @@ -1383,15 +1404,17 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; + bool isnew; might_sleep(); again: - inode = find_inode(sb, head, test, data, false); + inode = find_inode(sb, head, test, data, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1426,15 +1449,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + bool isnew; might_sleep(); again: - inode = find_inode_fast(sb, head, ino, false); + inode = find_inode_fast(sb, head, ino, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1448,11 +1473,11 @@ again: spin_lock(&inode_hash_lock); /* We released the lock, so.. */ - old = find_inode_fast(sb, head, ino, true); + old = find_inode_fast(sb, head, ino, true, &isnew); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); - inode->i_state = I_NEW; + inode_state_assign(inode, I_NEW); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); @@ -1474,7 +1499,8 @@ again: if (IS_ERR(old)) return NULL; inode = old; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1545,7 +1571,7 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode->i_lock); - if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { + if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) { __iget(inode); spin_unlock(&inode->i_lock); } else { @@ -1578,13 +1604,13 @@ EXPORT_SYMBOL(igrab); * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), void *data) + int (*test)(struct inode *, void *), void *data, bool *isnew) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; spin_lock(&inode_hash_lock); - inode = find_inode(sb, head, test, data, true); + inode = find_inode(sb, head, test, data, true, isnew); spin_unlock(&inode_hash_lock); return IS_ERR(inode) ? NULL : inode; @@ -1612,13 +1638,15 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; + bool isnew; might_sleep(); again: - inode = ilookup5_nowait(sb, hashval, test, data); + inode = ilookup5_nowait(sb, hashval, test, data, &isnew); if (inode) { - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1640,16 +1668,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + bool isnew; might_sleep(); again: - inode = find_inode_fast(sb, head, ino, false); + inode = find_inode_fast(sb, head, ino, false, &isnew); if (inode) { if (IS_ERR(inode)) return NULL; - wait_on_inode(inode); + if (unlikely(isnew)) + wait_on_new_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; @@ -1741,7 +1771,7 @@ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb == sb && - !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && + !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) return inode; } @@ -1780,7 +1810,7 @@ struct inode *find_inode_by_ino_rcu(struct super_block *sb, hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb && - !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) + !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE))) return inode; } return NULL; @@ -1792,6 +1822,7 @@ int insert_inode_locked(struct inode *inode) struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); + bool isnew; might_sleep(); @@ -1804,7 +1835,7 @@ int insert_inode_locked(struct inode *inode) if (old->i_sb != sb) continue; spin_lock(&old->i_lock); - if (old->i_state & (I_FREEING|I_WILL_FREE)) { + if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) { spin_unlock(&old->i_lock); continue; } @@ -1812,21 +1843,23 @@ int insert_inode_locked(struct inode *inode) } if (likely(!old)) { spin_lock(&inode->i_lock); - inode->i_state |= I_NEW | I_CREATING; + inode_state_set(inode, I_NEW | I_CREATING); hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; } - if (unlikely(old->i_state & I_CREATING)) { + if (unlikely(inode_state_read(old) & I_CREATING)) { spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); return -EBUSY; } __iget(old); + isnew = !!(inode_state_read(old) & I_NEW); spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); - wait_on_inode(old); + if (isnew) + wait_on_new_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; @@ -1843,7 +1876,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, might_sleep(); - inode->i_state |= I_CREATING; + inode_state_set_raw(inode, I_CREATING); old = inode_insert5(inode, hashval, test, NULL, data); if (old != inode) { @@ -1875,10 +1908,10 @@ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; - unsigned long state; int drop; - WARN_ON(inode->i_state & I_NEW); + WARN_ON(inode_state_read(inode) & I_NEW); + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); if (op->drop_inode) drop = op->drop_inode(inode); @@ -1886,29 +1919,33 @@ static void iput_final(struct inode *inode) drop = inode_generic_drop(inode); if (!drop && - !(inode->i_state & I_DONTCACHE) && + !(inode_state_read(inode) & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { - __inode_add_lru(inode, true); + __inode_lru_list_add(inode, true); spin_unlock(&inode->i_lock); return; } - state = inode->i_state; - if (!drop) { - WRITE_ONCE(inode->i_state, state | I_WILL_FREE); + /* + * Re-check ->i_count in case the ->drop_inode() hooks played games. + * Note we only execute this if the verdict was to drop the inode. + */ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode); + + if (drop) { + inode_state_set(inode, I_FREEING); + } else { + inode_state_set(inode, I_WILL_FREE); spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); - state = inode->i_state; - WARN_ON(state & I_NEW); - state &= ~I_WILL_FREE; + WARN_ON(inode_state_read(inode) & I_NEW); + inode_state_replace(inode, I_WILL_FREE, I_FREEING); } - WRITE_ONCE(inode->i_state, state | I_FREEING); - if (!list_empty(&inode->i_lru)) - inode_lru_list_del(inode); + inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); @@ -1931,7 +1968,7 @@ void iput(struct inode *inode) retry: lockdep_assert_not_held(&inode->i_lock); - VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode); + VFS_BUG_ON_INODE(inode_state_read_once(inode) & I_CLEAR, inode); /* * Note this assert is technically racy as if the count is bogusly * equal to one, then two CPUs racing to further drop it can both @@ -1942,14 +1979,14 @@ retry: if (atomic_add_unless(&inode->i_count, -1, 1)) return; - if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) { + if ((inode_state_read_once(inode) & I_DIRTY_TIME) && inode->i_nlink) { trace_writeback_lazytime_iput(inode); mark_inode_dirty_sync(inode); goto retry; } spin_lock(&inode->i_lock); - if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) { + if (unlikely((inode_state_read(inode) & I_DIRTY_TIME) && inode->i_nlink)) { spin_unlock(&inode->i_lock); goto retry; } @@ -1967,6 +2004,18 @@ retry: } EXPORT_SYMBOL(iput); +/** + * iput_not_last - put an inode assuming this is not the last reference + * @inode: inode to put + */ +void iput_not_last(struct inode *inode) +{ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode); + + WARN_ON(atomic_sub_return(1, &inode->i_count) == 0); +} +EXPORT_SYMBOL(iput_not_last); + #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file @@ -2310,42 +2359,40 @@ out: } EXPORT_SYMBOL(current_time); -static int inode_needs_update_time(struct inode *inode) +static int file_update_time_flags(struct file *file, unsigned int flags) { + struct inode *inode = file_inode(file); struct timespec64 now, ts; - int sync_it = 0; + int sync_mode = 0; + int ret = 0; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; + if (unlikely(file->f_mode & FMODE_NOCMTIME)) + return 0; now = current_time(inode); ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) - sync_it |= S_MTIME; - + sync_mode |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) - sync_it |= S_CTIME; - + sync_mode |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) - sync_it |= S_VERSION; + sync_mode |= S_VERSION; - return sync_it; -} - -static int __file_update_time(struct file *file, int sync_mode) -{ - int ret = 0; - struct inode *inode = file_inode(file); + if (!sync_mode) + return 0; - /* try to update time settings */ - if (!mnt_get_write_access_file(file)) { - ret = inode_update_time(inode, sync_mode); - mnt_put_write_access_file(file); - } + if (flags & IOCB_NOWAIT) + return -EAGAIN; + if (mnt_get_write_access_file(file)) + return 0; + ret = inode_update_time(inode, sync_mode); + mnt_put_write_access_file(file); return ret; } @@ -2365,14 +2412,7 @@ static int __file_update_time(struct file *file, int sync_mode) */ int file_update_time(struct file *file) { - int ret; - struct inode *inode = file_inode(file); - - ret = inode_needs_update_time(inode); - if (ret <= 0) - return ret; - - return __file_update_time(file, ret); + return file_update_time_flags(file, 0); } EXPORT_SYMBOL(file_update_time); @@ -2394,7 +2434,6 @@ EXPORT_SYMBOL(file_update_time); static int file_modified_flags(struct file *file, int flags) { int ret; - struct inode *inode = file_inode(file); /* * Clear the security bits if the process is not being run by root. @@ -2403,17 +2442,7 @@ static int file_modified_flags(struct file *file, int flags) ret = file_remove_privs_flags(file, flags); if (ret) return ret; - - if (unlikely(file->f_mode & FMODE_NOCMTIME)) - return 0; - - ret = inode_needs_update_time(inode); - if (ret <= 0) - return ret; - if (flags & IOCB_NOWAIT) - return -EAGAIN; - - return __file_update_time(file, ret); + return file_update_time_flags(file, flags); } /** @@ -2970,7 +2999,7 @@ void dump_inode(struct inode *inode, const char *reason) pr_warn("%s encountered for inode %px\n" "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, - inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); + inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); diff --git a/fs/internal.h b/fs/internal.h index 9b2b4d116880..d08d5e2235e9 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -67,6 +67,9 @@ int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode); struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); +struct dentry *start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags); +int lookup_noperm_common(struct qstr *qname, struct dentry *base); /* * namespace.c diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index f7e1c8534c46..a572b8808524 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -14,5 +14,6 @@ iomap-y += trace.o \ iomap-$(CONFIG_BLOCK) += direct-io.o \ ioend.o \ fiemap.o \ - seek.o + seek.o \ + bio.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c new file mode 100644 index 000000000000..fc045f2e4c45 --- /dev/null +++ b/fs/iomap/bio.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2016-2023 Christoph Hellwig. + */ +#include <linux/iomap.h> +#include <linux/pagemap.h> +#include "internal.h" +#include "trace.h" + +static void iomap_read_end_io(struct bio *bio) +{ + int error = blk_status_to_errno(bio->bi_status); + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) + iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); + bio_put(bio); +} + +static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) +{ + struct bio *bio = ctx->read_ctx; + + if (bio) + submit_bio(bio); +} + +static int iomap_bio_read_folio_range(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t plen) +{ + struct folio *folio = ctx->cur_folio; + const struct iomap *iomap = &iter->iomap; + loff_t pos = iter->pos; + size_t poff = offset_in_folio(folio, pos); + loff_t length = iomap_length(iter); + sector_t sector; + struct bio *bio = ctx->read_ctx; + + sector = iomap_sector(iomap, pos); + if (!bio || bio_end_sector(bio) != sector || + !bio_add_folio(bio, folio, plen, poff)) { + gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); + gfp_t orig_gfp = gfp; + unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); + + if (bio) + submit_bio(bio); + + if (ctx->rac) /* same as readahead_gfp_mask */ + gfp |= __GFP_NORETRY | __GFP_NOWARN; + bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, + gfp); + /* + * If the bio_alloc fails, try it again for a single page to + * avoid having to deal with partial page reads. This emulates + * what do_mpage_read_folio does. + */ + if (!bio) + bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp); + if (ctx->rac) + bio->bi_opf |= REQ_RAHEAD; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = iomap_read_end_io; + bio_add_folio_nofail(bio, folio, plen, poff); + ctx->read_ctx = bio; + } + return 0; +} + +const struct iomap_read_ops iomap_bio_read_ops = { + .read_folio_range = iomap_bio_read_folio_range, + .submit_read = iomap_bio_submit_read, +}; +EXPORT_SYMBOL_GPL(iomap_bio_read_ops); + +int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len) +{ + const struct iomap *srcmap = iomap_iter_srcmap(iter); + struct bio_vec bvec; + struct bio bio; + + bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); + bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); + return submit_bio_wait(&bio); +} diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8b847a1e27f1..e5c1ca440d93 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -8,6 +8,7 @@ #include <linux/writeback.h> #include <linux/swap.h> #include <linux/migrate.h> +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -37,10 +38,28 @@ static inline bool ifs_is_fully_uptodate(struct folio *folio, return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); } -static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, - unsigned int block) +/* + * Find the next uptodate block in the folio. end_blk is inclusive. + * If no uptodate block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_uptodate_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) +{ + struct iomap_folio_state *ifs = folio->private; + + return find_next_bit(ifs->state, end_blk + 1, start_blk); +} + +/* + * Find the next non-uptodate block in the folio. end_blk is inclusive. + * If no non-uptodate block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_nonuptodate_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) { - return test_bit(block, ifs->state); + struct iomap_folio_state *ifs = folio->private; + + return find_next_zero_bit(ifs->state, end_blk + 1, start_blk); } static bool ifs_set_range_uptodate(struct folio *folio, @@ -75,13 +94,34 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off, folio_mark_uptodate(folio); } -static inline bool ifs_block_is_dirty(struct folio *folio, - struct iomap_folio_state *ifs, int block) +/* + * Find the next dirty block in the folio. end_blk is inclusive. + * If no dirty block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_dirty_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) { + struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int blks = i_blocks_per_folio(inode, folio); + + return find_next_bit(ifs->state, blks + end_blk + 1, + blks + start_blk) - blks; +} + +/* + * Find the next clean block in the folio. end_blk is inclusive. + * If no clean block is found, this will return end_blk + 1. + */ +static unsigned ifs_next_clean_block(struct folio *folio, + unsigned start_blk, unsigned end_blk) +{ + struct iomap_folio_state *ifs = folio->private; + struct inode *inode = folio->mapping->host; + unsigned int blks = i_blocks_per_folio(inode, folio); - return test_bit(block + blks_per_folio, ifs->state); + return find_next_zero_bit(ifs->state, blks + end_blk + 1, + blks + start_blk) - blks; } static unsigned ifs_find_dirty_range(struct folio *folio, @@ -92,18 +132,17 @@ static unsigned ifs_find_dirty_range(struct folio *folio, offset_in_folio(folio, *range_start) >> inode->i_blkbits; unsigned end_blk = min_not_zero( offset_in_folio(folio, range_end) >> inode->i_blkbits, - i_blocks_per_folio(inode, folio)); - unsigned nblks = 1; + i_blocks_per_folio(inode, folio)) - 1; + unsigned nblks; - while (!ifs_block_is_dirty(folio, ifs, start_blk)) - if (++start_blk == end_blk) - return 0; - - while (start_blk + nblks < end_blk) { - if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) - break; - nblks++; - } + start_blk = ifs_next_dirty_block(folio, start_blk, end_blk); + if (start_blk > end_blk) + return 0; + if (start_blk == end_blk) + nblks = 1; + else + nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk) - + start_blk; *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); return nblks << inode->i_blkbits; @@ -218,6 +257,22 @@ static void ifs_free(struct folio *folio) } /* + * Calculate how many bytes to truncate based off the number of blocks to + * truncate and the end position to start truncating from. + */ +static size_t iomap_bytes_to_truncate(loff_t end_pos, unsigned block_bits, + unsigned blocks_truncated) +{ + unsigned block_size = 1 << block_bits; + unsigned block_offset = end_pos & (block_size - 1); + + if (!block_offset) + return blocks_truncated << block_bits; + + return ((blocks_truncated - 1) << block_bits) + block_offset; +} + +/* * Calculate the range inside the folio that we actually need to read. */ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, @@ -240,24 +295,29 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, * to avoid reading in already uptodate ranges. */ if (ifs) { - unsigned int i; - - /* move forward for each leading block marked uptodate */ - for (i = first; i <= last; i++) { - if (!ifs_block_is_uptodate(ifs, i)) - break; - *pos += block_size; - poff += block_size; - plen -= block_size; - first++; + unsigned int next, blocks_skipped; + + next = ifs_next_nonuptodate_block(folio, first, last); + blocks_skipped = next - first; + + if (blocks_skipped) { + unsigned long block_offset = *pos & (block_size - 1); + unsigned bytes_skipped = + (blocks_skipped << block_bits) - block_offset; + + *pos += bytes_skipped; + poff += bytes_skipped; + plen -= bytes_skipped; } + first = next; /* truncate len if we find any trailing uptodate block(s) */ - while (++i <= last) { - if (ifs_block_is_uptodate(ifs, i)) { - plen -= (last - i + 1) * block_size; - last = i - 1; - break; + if (++next <= last) { + next = ifs_next_uptodate_block(folio, next, last); + if (next <= last) { + plen -= iomap_bytes_to_truncate(*pos + plen, + block_bits, last - next + 1); + last = next - 1; } } } @@ -271,7 +331,8 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, unsigned end = offset_in_folio(folio, isize - 1) >> block_bits; if (first <= end && last > end) - plen -= (last - end) * block_size; + plen -= iomap_bytes_to_truncate(*pos + plen, block_bits, + last - end); } *offp = poff; @@ -320,9 +381,8 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, return 0; } -#ifdef CONFIG_BLOCK -static void iomap_finish_folio_read(struct folio *folio, size_t off, - size_t len, int error) +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error) { struct iomap_folio_state *ifs = folio->private; bool uptodate = !error; @@ -342,169 +402,201 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off, if (finished) folio_end_read(folio, uptodate); } +EXPORT_SYMBOL_GPL(iomap_finish_folio_read); -static void iomap_read_end_io(struct bio *bio) +static void iomap_read_init(struct folio *folio) { - int error = blk_status_to_errno(bio->bi_status); - struct folio_iter fi; + struct iomap_folio_state *ifs = folio->private; - bio_for_each_folio_all(fi, bio) - iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); - bio_put(bio); + if (ifs) { + size_t len = folio_size(folio); + + /* + * ifs->read_bytes_pending is used to track how many bytes are + * read in asynchronously by the IO helper. We need to track + * this so that we can know when the IO helper has finished + * reading in all the necessary ranges of the folio and can end + * the read. + * + * Increase ->read_bytes_pending by the folio size to start, and + * add a +1 bias. We'll subtract the bias and any uptodate / + * zeroed ranges that did not require IO in iomap_read_end() + * after we're done processing the folio. + * + * We do this because otherwise, we would have to increment + * ifs->read_bytes_pending every time a range in the folio needs + * to be read in, which can get expensive since the spinlock + * needs to be held whenever modifying ifs->read_bytes_pending. + * + * We add the bias to ensure the read has not been ended on the + * folio when iomap_read_end() is called, even if the IO helper + * has already finished reading in the entire folio. + */ + spin_lock_irq(&ifs->state_lock); + WARN_ON_ONCE(ifs->read_bytes_pending != 0); + ifs->read_bytes_pending = len + 1; + spin_unlock_irq(&ifs->state_lock); + } } -struct iomap_readpage_ctx { - struct folio *cur_folio; - bool cur_folio_in_bio; - struct bio *bio; - struct readahead_control *rac; -}; +/* + * This ends IO if no bytes were submitted to an IO helper. + * + * Otherwise, this calibrates ifs->read_bytes_pending to represent only the + * submitted bytes (see comment in iomap_read_init()). If all bytes submitted + * have already been completed by the IO helper, then this will end the read. + * Else the IO helper will end the read after all submitted ranges have been + * read. + */ +static void iomap_read_end(struct folio *folio, size_t bytes_submitted) +{ + struct iomap_folio_state *ifs = folio->private; -static int iomap_readpage_iter(struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx) + if (ifs) { + bool end_read, uptodate; + + spin_lock_irq(&ifs->state_lock); + if (!ifs->read_bytes_pending) { + WARN_ON_ONCE(bytes_submitted); + spin_unlock_irq(&ifs->state_lock); + folio_unlock(folio); + return; + } + + /* + * Subtract any bytes that were initially accounted to + * read_bytes_pending but skipped for IO. The +1 accounts for + * the bias we added in iomap_read_init(). + */ + ifs->read_bytes_pending -= + (folio_size(folio) + 1 - bytes_submitted); + + /* + * If !ifs->read_bytes_pending, this means all pending reads by + * the IO helper have already completed, which means we need to + * end the folio read here. If ifs->read_bytes_pending != 0, + * the IO helper will end the folio read. + */ + end_read = !ifs->read_bytes_pending; + if (end_read) + uptodate = ifs_is_fully_uptodate(folio, ifs); + spin_unlock_irq(&ifs->state_lock); + if (end_read) + folio_end_read(folio, uptodate); + } else if (!bytes_submitted) { + /* + * If there were no bytes submitted, this means we are + * responsible for unlocking the folio here, since no IO helper + * has taken ownership of it. If there were bytes submitted, + * then the IO helper will end the read via + * iomap_finish_folio_read(). + */ + folio_unlock(folio); + } +} + +static int iomap_read_folio_iter(struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t *bytes_submitted) { const struct iomap *iomap = &iter->iomap; loff_t pos = iter->pos; loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; - struct iomap_folio_state *ifs; size_t poff, plen; - sector_t sector; + loff_t pos_diff; int ret; if (iomap->type == IOMAP_INLINE) { ret = iomap_read_inline_data(iter, folio); if (ret) return ret; - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } - /* zero post-eof blocks as the page may be mapped */ - ifs = ifs_alloc(iter->inode, folio, iter->flags); - iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); - if (plen == 0) - goto done; + ifs_alloc(iter->inode, folio, iter->flags); - if (iomap_block_needs_zeroing(iter, pos)) { - folio_zero_range(folio, poff, plen); - iomap_set_range_uptodate(folio, poff, plen); - goto done; - } + length = min_t(loff_t, length, + folio_size(folio) - offset_in_folio(folio, pos)); + while (length) { + iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, + &plen); - ctx->cur_folio_in_bio = true; - if (ifs) { - spin_lock_irq(&ifs->state_lock); - ifs->read_bytes_pending += plen; - spin_unlock_irq(&ifs->state_lock); - } + pos_diff = pos - iter->pos; + if (WARN_ON_ONCE(pos_diff + plen > length)) + return -EIO; - sector = iomap_sector(iomap, pos); - if (!ctx->bio || - bio_end_sector(ctx->bio) != sector || - !bio_add_folio(ctx->bio, folio, plen, poff)) { - gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); - gfp_t orig_gfp = gfp; - unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); - - if (ctx->bio) - submit_bio(ctx->bio); - - if (ctx->rac) /* same as readahead_gfp_mask */ - gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), - REQ_OP_READ, gfp); - /* - * If the bio_alloc fails, try it again for a single page to - * avoid having to deal with partial page reads. This emulates - * what do_mpage_read_folio does. - */ - if (!ctx->bio) { - ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, - orig_gfp); - } - if (ctx->rac) - ctx->bio->bi_opf |= REQ_RAHEAD; - ctx->bio->bi_iter.bi_sector = sector; - ctx->bio->bi_end_io = iomap_read_end_io; - bio_add_folio_nofail(ctx->bio, folio, plen, poff); - } + ret = iomap_iter_advance(iter, pos_diff); + if (ret) + return ret; -done: - /* - * Move the caller beyond our range so that it keeps making progress. - * For that, we have to include any leading non-uptodate ranges, but - * we can skip trailing ones as they will be handled in the next - * iteration. - */ - length = pos - iter->pos + plen; - return iomap_iter_advance(iter, &length); -} + if (plen == 0) + return 0; -static int iomap_read_folio_iter(struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx) -{ - int ret; + /* zero post-eof blocks as the page may be mapped */ + if (iomap_block_needs_zeroing(iter, pos)) { + folio_zero_range(folio, poff, plen); + iomap_set_range_uptodate(folio, poff, plen); + } else { + if (!*bytes_submitted) + iomap_read_init(folio); + ret = ctx->ops->read_folio_range(iter, ctx, plen); + if (ret) + return ret; + *bytes_submitted += plen; + } - while (iomap_length(iter)) { - ret = iomap_readpage_iter(iter, ctx); + ret = iomap_iter_advance(iter, plen); if (ret) return ret; + length -= pos_diff + plen; + pos = iter->pos; } - return 0; } -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) +void iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx) { + struct folio *folio = ctx->cur_folio; struct iomap_iter iter = { .inode = folio->mapping->host, .pos = folio_pos(folio), .len = folio_size(folio), }; - struct iomap_readpage_ctx ctx = { - .cur_folio = folio, - }; + size_t bytes_submitted = 0; int ret; trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_read_folio_iter(&iter, ctx, + &bytes_submitted); - if (ctx.bio) { - submit_bio(ctx.bio); - WARN_ON_ONCE(!ctx.cur_folio_in_bio); - } else { - WARN_ON_ONCE(ctx.cur_folio_in_bio); - folio_unlock(folio); - } + if (ctx->ops->submit_read) + ctx->ops->submit_read(ctx); - /* - * Just like mpage_readahead and block_read_full_folio, we always - * return 0 and just set the folio error flag on errors. This - * should be cleaned up throughout the stack eventually. - */ - return 0; + iomap_read_end(folio, bytes_submitted); } EXPORT_SYMBOL_GPL(iomap_read_folio); static int iomap_readahead_iter(struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx) + struct iomap_read_folio_ctx *ctx, size_t *cur_bytes_submitted) { int ret; while (iomap_length(iter)) { if (ctx->cur_folio && offset_in_folio(ctx->cur_folio, iter->pos) == 0) { - if (!ctx->cur_folio_in_bio) - folio_unlock(ctx->cur_folio); + iomap_read_end(ctx->cur_folio, *cur_bytes_submitted); ctx->cur_folio = NULL; } if (!ctx->cur_folio) { ctx->cur_folio = readahead_folio(ctx->rac); - ctx->cur_folio_in_bio = false; + if (WARN_ON_ONCE(!ctx->cur_folio)) + return -EINVAL; + *cur_bytes_submitted = 0; } - ret = iomap_readpage_iter(iter, ctx); + ret = iomap_read_folio_iter(iter, ctx, cur_bytes_submitted); if (ret) return ret; } @@ -514,8 +606,8 @@ static int iomap_readahead_iter(struct iomap_iter *iter, /** * iomap_readahead - Attempt to read pages from a file. - * @rac: Describes the pages to be read. * @ops: The operations vector for the filesystem. + * @ctx: The ctx used for issuing readahead. * * This function is for filesystems to call to implement their readahead * address_space operation. @@ -527,51 +619,30 @@ static int iomap_readahead_iter(struct iomap_iter *iter, * function is called with memalloc_nofs set, so allocations will not cause * the filesystem to be reentered. */ -void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx) { + struct readahead_control *rac = ctx->rac; struct iomap_iter iter = { .inode = rac->mapping->host, .pos = readahead_pos(rac), .len = readahead_length(rac), }; - struct iomap_readpage_ctx ctx = { - .rac = rac, - }; + size_t cur_bytes_submitted; trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.status = iomap_readahead_iter(&iter, &ctx); + iter.status = iomap_readahead_iter(&iter, ctx, + &cur_bytes_submitted); - if (ctx.bio) - submit_bio(ctx.bio); - if (ctx.cur_folio) { - if (!ctx.cur_folio_in_bio) - folio_unlock(ctx.cur_folio); - } -} -EXPORT_SYMBOL_GPL(iomap_readahead); - -static int iomap_read_folio_range(const struct iomap_iter *iter, - struct folio *folio, loff_t pos, size_t len) -{ - const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct bio_vec bvec; - struct bio bio; + if (ctx->ops->submit_read) + ctx->ops->submit_read(ctx); - bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); - bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); - return submit_bio_wait(&bio); + if (ctx->cur_folio) + iomap_read_end(ctx->cur_folio, cur_bytes_submitted); } -#else -static int iomap_read_folio_range(const struct iomap_iter *iter, - struct folio *folio, loff_t pos, size_t len) -{ - WARN_ON_ONCE(1); - return -EIO; -} -#endif /* CONFIG_BLOCK */ +EXPORT_SYMBOL_GPL(iomap_readahead); /* * iomap_is_partially_uptodate checks whether blocks within a folio are @@ -584,7 +655,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned first, last, i; + unsigned first, last; if (!ifs) return false; @@ -596,10 +667,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) first = from >> inode->i_blkbits; last = (from + count - 1) >> inode->i_blkbits; - for (i = first; i <= last; i++) - if (!ifs_block_is_uptodate(ifs, i)) - return false; - return true; + return ifs_next_nonuptodate_block(folio, first, last) > last; } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); @@ -707,7 +775,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, * are not changing pagecache contents. */ if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) && - pos + len >= folio_pos(folio) + folio_size(folio)) + pos + len >= folio_next_pos(folio)) return 0; ifs = ifs_alloc(iter->inode, folio, iter->flags); @@ -723,9 +791,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, if (plen == 0) break; - if (!(iter->flags & IOMAP_UNSHARE) && - (from <= poff || from >= poff + plen) && - (to <= poff || to >= poff + plen)) + /* + * If the read range will be entirely overwritten by the write, + * we can skip having to zero/read it in. + */ + if (!(iter->flags & IOMAP_UNSHARE) && from <= poff && + to >= poff + plen) continue; if (iomap_block_needs_zeroing(iter, block_start)) { @@ -742,7 +813,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, status = write_ops->read_folio_range(iter, folio, block_start, plen); else - status = iomap_read_folio_range(iter, + status = iomap_bio_read_folio_range_sync(iter, folio, block_start, plen); if (status) return status; @@ -761,6 +832,28 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, if (!mapping_large_folio_support(iter->inode->i_mapping)) len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); + if (iter->fbatch) { + struct folio *folio = folio_batch_next(iter->fbatch); + + if (!folio) + return NULL; + + /* + * The folio mapping generally shouldn't have changed based on + * fs locks, but be consistent with filemap lookup and retry + * the iter if it does. + */ + folio_lock(folio); + if (unlikely(folio->mapping != iter->inode->i_mapping)) { + iter->iomap.flags |= IOMAP_F_STALE; + folio_unlock(folio); + return NULL; + } + + folio_get(folio); + return folio; + } + if (write_ops && write_ops->get_folio) return write_ops->get_folio(iter, pos, len); return iomap_get_folio(iter, pos, len); @@ -815,15 +908,14 @@ static int iomap_write_begin(struct iomap_iter *iter, size_t *poffset, u64 *plen) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; + loff_t pos; u64 len = min_t(u64, SIZE_MAX, iomap_length(iter)); struct folio *folio; int status = 0; len = min_not_zero(len, *plen); - BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); - if (srcmap != &iter->iomap) - BUG_ON(pos + len > srcmap->offset + srcmap->length); + *foliop = NULL; + *plen = 0; if (fatal_signal_pending(current)) return -EINTR; @@ -833,6 +925,15 @@ static int iomap_write_begin(struct iomap_iter *iter, return PTR_ERR(folio); /* + * No folio means we're done with a batch. We still have range to + * process so return and let the caller iterate and refill the batch. + */ + if (!folio) { + WARN_ON_ONCE(!iter->fbatch); + return 0; + } + + /* * Now we have a locked folio, before we do anything with it we need to * check that the iomap we have cached is not stale. The inode extent * mapping can change due to concurrent IO in flight (e.g. @@ -852,6 +953,22 @@ static int iomap_write_begin(struct iomap_iter *iter, } } + /* + * The folios in a batch may not be contiguous. If we've skipped + * forward, advance the iter to the pos of the current folio. If the + * folio starts beyond the end of the mapping, it may have been trimmed + * since the lookup for whatever reason. Return a NULL folio to + * terminate the op. + */ + if (folio_pos(folio) > iter->pos) { + len = min_t(u64, folio_pos(folio) - iter->pos, + iomap_length(iter)); + status = iomap_iter_advance(iter, len); + len = iomap_length(iter); + if (status || !len) + goto out_unlock; + } + pos = iomap_trim_folio_range(iter, folio, poffset, &len); if (srcmap->type == IOMAP_INLINE) @@ -1041,7 +1158,7 @@ retry: } } else { total_written += written; - iomap_iter_advance(iter, &written); + iomap_iter_advance(iter, written); } } while (iov_iter_count(i) && iomap_length(iter)); @@ -1082,7 +1199,7 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode, struct folio *folio, loff_t start_byte, loff_t end_byte, struct iomap *iomap, iomap_punch_t punch) { - unsigned int first_blk, last_blk, i; + unsigned int first_blk, last_blk; loff_t last_byte; u8 blkbits = inode->i_blkbits; struct iomap_folio_state *ifs; @@ -1097,14 +1214,14 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode, if (!ifs) return; - last_byte = min_t(loff_t, end_byte - 1, - folio_pos(folio) + folio_size(folio) - 1); + last_byte = min_t(loff_t, end_byte - 1, folio_next_pos(folio) - 1); first_blk = offset_in_folio(folio, start_byte) >> blkbits; last_blk = offset_in_folio(folio, last_byte) >> blkbits; - for (i = first_blk; i <= last_blk; i++) { - if (!ifs_block_is_dirty(folio, ifs, i)) - punch(inode, folio_pos(folio) + (i << blkbits), - 1 << blkbits, iomap); + while ((first_blk = ifs_next_clean_block(folio, first_blk, last_blk)) + <= last_blk) { + punch(inode, folio_pos(folio) + (first_blk << blkbits), + 1 << blkbits, iomap); + first_blk++; } } @@ -1129,8 +1246,7 @@ static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, * Make sure the next punch start is correctly bound to * the end of this data range, not the end of the folio. */ - *punch_start_byte = min_t(loff_t, end_byte, - folio_pos(folio) + folio_size(folio)); + *punch_start_byte = min_t(loff_t, end_byte, folio_next_pos(folio)); } /* @@ -1170,7 +1286,7 @@ static void iomap_write_delalloc_scan(struct inode *inode, start_byte, end_byte, iomap, punch); /* move offset to start of next folio in range */ - start_byte = folio_pos(folio) + folio_size(folio); + start_byte = folio_next_pos(folio); folio_unlock(folio); folio_put(folio); } @@ -1310,7 +1426,7 @@ static int iomap_unshare_iter(struct iomap_iter *iter, int status; if (!iomap_want_unshare_iter(iter)) - return iomap_iter_advance(iter, &bytes); + return iomap_iter_advance(iter, bytes); do { struct folio *folio; @@ -1334,10 +1450,10 @@ static int iomap_unshare_iter(struct iomap_iter *iter, balance_dirty_pages_ratelimited(iter->inode->i_mapping); - status = iomap_iter_advance(iter, &bytes); + status = iomap_iter_advance(iter, bytes); if (status) break; - } while (bytes > 0); + } while ((bytes = iomap_length(iter)) > 0); return status; } @@ -1398,6 +1514,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, if (iter->iomap.flags & IOMAP_F_STALE) break; + /* a NULL folio means we're done with a folio batch */ + if (!folio) { + status = iomap_iter_advance_full(iter); + break; + } + /* warn about zeroing folios beyond eof that won't write back */ WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); @@ -1412,16 +1534,36 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, if (WARN_ON_ONCE(!ret)) return -EIO; - status = iomap_iter_advance(iter, &bytes); + status = iomap_iter_advance(iter, bytes); if (status) break; - } while (bytes > 0); + } while ((bytes = iomap_length(iter)) > 0); if (did_zero) *did_zero = true; return status; } +loff_t +iomap_fill_dirty_folios( + struct iomap_iter *iter, + loff_t offset, + loff_t length) +{ + struct address_space *mapping = iter->inode->i_mapping; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + length - 1) >> PAGE_SHIFT; + + iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL); + if (!iter->fbatch) + return offset + length; + folio_batch_init(iter->fbatch); + + filemap_get_folios_dirty(mapping, &start, end, iter->fbatch); + return (start << PAGE_SHIFT); +} +EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios); + int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, @@ -1435,46 +1577,26 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, .private = private, }; struct address_space *mapping = inode->i_mapping; - unsigned int blocksize = i_blocksize(inode); - unsigned int off = pos & (blocksize - 1); - loff_t plen = min_t(loff_t, len, blocksize - off); int ret; bool range_dirty; /* - * Zero range can skip mappings that are zero on disk so long as - * pagecache is clean. If pagecache was dirty prior to zero range, the - * mapping converts on writeback completion and so must be zeroed. - * - * The simplest way to deal with this across a range is to flush - * pagecache and process the updated mappings. To avoid excessive - * flushing on partial eof zeroing, special case it to zero the - * unaligned start portion if already dirty in pagecache. - */ - if (off && - filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { - iter.len = plen; - while ((ret = iomap_iter(&iter, ops)) > 0) - iter.status = iomap_zero_iter(&iter, did_zero, - write_ops); - - iter.len = len - (iter.pos - pos); - if (ret || !iter.len) - return ret; - } - - /* * To avoid an unconditional flush, check pagecache state and only flush * if dirty and the fs returns a mapping that might convert on * writeback. */ - range_dirty = filemap_range_needs_writeback(inode->i_mapping, - iter.pos, iter.pos + iter.len - 1); + range_dirty = filemap_range_needs_writeback(mapping, iter.pos, + iter.pos + iter.len - 1); while ((ret = iomap_iter(&iter, ops)) > 0) { const struct iomap *srcmap = iomap_iter_srcmap(&iter); - if (srcmap->type == IOMAP_HOLE || - srcmap->type == IOMAP_UNWRITTEN) { + if (WARN_ON_ONCE(iter.fbatch && + srcmap->type != IOMAP_UNWRITTEN)) + return -EIO; + + if (!iter.fbatch && + (srcmap->type == IOMAP_HOLE || + srcmap->type == IOMAP_UNWRITTEN)) { s64 status; if (range_dirty) { @@ -1526,7 +1648,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, folio_mark_dirty(folio); } - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, @@ -1559,16 +1681,25 @@ out_unlock: } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len) +static void iomap_writeback_init(struct inode *inode, struct folio *folio) { struct iomap_folio_state *ifs = folio->private; WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); - if (ifs) - atomic_add(len, &ifs->write_bytes_pending); + if (ifs) { + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + /* + * Set this to the folio size. After processing the folio for + * writeback in iomap_writeback_folio(), we'll subtract any + * ranges not written back. + * + * We do this because otherwise, we would have to atomically + * increment ifs->write_bytes_pending every time a range in the + * folio needs to be written back. + */ + atomic_set(&ifs->write_bytes_pending, folio_size(folio)); + } } -EXPORT_SYMBOL_GPL(iomap_start_folio_write); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len) @@ -1585,7 +1716,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_folio_write); static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 pos, u32 rlen, u64 end_pos, - bool *wb_pending) + size_t *bytes_submitted) { do { ssize_t ret; @@ -1599,11 +1730,11 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc, pos += ret; /* - * Holes are not be written back by ->writeback_range, so track + * Holes are not written back by ->writeback_range, so track * if we did handle anything that is not a hole here. */ if (wpc->iomap.type != IOMAP_HOLE) - *wb_pending = true; + *bytes_submitted += ret; } while (rlen); return 0; @@ -1674,7 +1805,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); u64 end_aligned = 0; - bool wb_pending = false; + size_t bytes_submitted = 0; int error = 0; u32 rlen; @@ -1694,14 +1825,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) iomap_set_range_dirty(folio, 0, end_pos - pos); } - /* - * Keep the I/O completion handler from clearing the writeback - * bit until we have submitted all blocks by adding a bias to - * ifs->write_bytes_pending, which is dropped after submitting - * all blocks. - */ - WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); - iomap_start_folio_write(inode, folio, 1); + iomap_writeback_init(inode, folio); } /* @@ -1716,13 +1840,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) end_aligned = round_up(end_pos, i_blocksize(inode)); while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) { error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos, - &wb_pending); + &bytes_submitted); if (error) break; pos += rlen; } - if (wb_pending) + if (bytes_submitted) wpc->nr_folios++; /* @@ -1740,12 +1864,20 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio) * bit ourselves right after unlocking the page. */ if (ifs) { - if (atomic_dec_and_test(&ifs->write_bytes_pending)) - folio_end_writeback(folio); - } else { - if (!wb_pending) - folio_end_writeback(folio); + /* + * Subtract any bytes that were initially accounted to + * write_bytes_pending but skipped for writeback. + */ + size_t bytes_not_submitted = folio_size(folio) - + bytes_submitted; + + if (bytes_not_submitted) + iomap_finish_folio_write(inode, folio, + bytes_not_submitted); + } else if (!bytes_submitted) { + folio_end_writeback(folio); } + mapping_set_error(inode->i_mapping, error); return error; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 5d5d63efbd57..8e273408453a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -16,21 +16,13 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ -#define IOMAP_DIO_NO_INVALIDATE (1U << 25) -#define IOMAP_DIO_CALLER_COMP (1U << 26) -#define IOMAP_DIO_INLINE_COMP (1U << 27) +#define IOMAP_DIO_NO_INVALIDATE (1U << 26) +#define IOMAP_DIO_COMP_WORK (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) #define IOMAP_DIO_NEED_SYNC (1U << 29) #define IOMAP_DIO_WRITE (1U << 30) #define IOMAP_DIO_DIRTY (1U << 31) -/* - * Used for sub block zeroing in iomap_dio_zero() - */ -#define IOMAP_ZERO_PAGE_SIZE (SZ_64K) -#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE)) -static struct page *zero_page; - struct iomap_dio { struct kiocb *iocb; const struct iomap_dio_ops *dops; @@ -140,11 +132,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) } EXPORT_SYMBOL_GPL(iomap_dio_complete); -static ssize_t iomap_dio_deferred_complete(void *data) -{ - return iomap_dio_complete(data); -} - static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); @@ -179,33 +166,33 @@ static void iomap_dio_done(struct iomap_dio *dio) WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); - } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { - WRITE_ONCE(iocb->private, NULL); - iomap_dio_complete_work(&dio->aio.work); - } else if (dio->flags & IOMAP_DIO_CALLER_COMP) { - /* - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then - * schedule our completion that way to avoid an async punt to a - * workqueue. - */ - /* only polled IO cares about private cleared */ - iocb->private = dio; - iocb->dio_complete = iomap_dio_deferred_complete; + return; + } - /* - * Invoke ->ki_complete() directly. We've assigned our - * dio_complete callback handler, and since the issuer set - * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will - * notice ->dio_complete being set and will defer calling that - * handler until it can be done from a safe task context. - * - * Note that the 'res' being passed in here is not important - * for this case. The actual completion value of the request - * will be gotten from dio_complete when that is run by the - * issuer. - */ - iocb->ki_complete(iocb, 0); - } else { + /* + * Always run error completions in user context. These are not + * performance critical and some code relies on taking sleeping locks + * for error handling. + */ + if (dio->error) + dio->flags |= IOMAP_DIO_COMP_WORK; + + /* + * Never invalidate pages from this context to avoid deadlocks with + * buffered I/O completions when called from the ioend workqueue, + * or avoid sleeping when called directly from ->bi_end_io. + * Tough luck if you hit the tiny race with someone dirtying the range + * right between this check and the actual completion. + */ + if ((dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_COMP_WORK)) { + if (dio->iocb->ki_filp->f_mapping->nrpages) + dio->flags |= IOMAP_DIO_COMP_WORK; + else + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + + if (dio->flags & IOMAP_DIO_COMP_WORK) { struct inode *inode = file_inode(iocb->ki_filp); /* @@ -216,7 +203,11 @@ static void iomap_dio_done(struct iomap_dio *dio) */ INIT_WORK(&dio->aio.work, iomap_dio_complete_work); queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); + return; } + + WRITE_ONCE(iocb->private, NULL); + iomap_dio_complete_work(&dio->aio.work); } void iomap_dio_bio_end_io(struct bio *bio) @@ -252,16 +243,9 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) /* * Try to avoid another context switch for the completion given * that we are already called from the ioend completion - * workqueue, but never invalidate pages from this thread to - * avoid deadlocks with buffered I/O completions. Tough luck if - * you hit the tiny race with someone dirtying the range now - * between this check and the actual completion. + * workqueue. */ - if (!dio->iocb->ki_filp->f_mapping->nrpages) { - dio->flags |= IOMAP_DIO_INLINE_COMP; - dio->flags |= IOMAP_DIO_NO_INVALIDATE; - } - dio->flags &= ~IOMAP_DIO_CALLER_COMP; + dio->flags &= ~IOMAP_DIO_COMP_WORK; iomap_dio_done(dio); } @@ -285,42 +269,36 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, { struct inode *inode = file_inode(dio->iocb->ki_filp); struct bio *bio; + struct folio *zero_folio = largest_zero_folio(); + int nr_vecs = max(1, i_blocksize(inode) / folio_size(zero_folio)); if (!len) return 0; + /* - * Max block size supported is 64k + * This limit shall never be reached as most filesystems have a + * maximum blocksize of 64k. */ - if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE)) + if (WARN_ON_ONCE(nr_vecs > BIO_MAX_VECS)) return -EINVAL; - bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); + bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - __bio_add_page(bio, zero_page, len, 0); + while (len > 0) { + unsigned int io_len = min(len, folio_size(zero_folio)); + + bio_add_folio_nofail(bio, zero_folio, io_len, 0); + len -= io_len; + } iomap_dio_submit_bio(iter, dio, bio, pos); - return 0; -} -/* - * Use a FUA write if we need datasync semantics and this is a pure data I/O - * that doesn't require any metadata updates (including after I/O completion - * such as unwritten extent conversion) and the underlying device either - * doesn't have a volatile write cache or supports FUA. - * This allows us to avoid cache flushes on I/O completion. - */ -static inline bool iomap_dio_can_use_fua(const struct iomap *iomap, - struct iomap_dio *dio) -{ - if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY)) - return false; - if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH)) - return false; - return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev); + return 0; } static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) @@ -336,12 +314,39 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) int nr_pages, ret = 0; u64 copied = 0; size_t orig_count; + unsigned int alignment; + + /* + * File systems that write out of place and always allocate new blocks + * need each bio to be block aligned as that's the unit of allocation. + */ + if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED) + alignment = fs_block_size; + else + alignment = bdev_logical_block_size(iomap->bdev); - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1)) + if ((pos | length) & (alignment - 1)) return -EINVAL; if (dio->flags & IOMAP_DIO_WRITE) { - bio_opf |= REQ_OP_WRITE; + bool need_completion_work = true; + + switch (iomap->type) { + case IOMAP_MAPPED: + /* + * Directly mapped I/O does not inherently need to do + * work at I/O completion time. But there are various + * cases below where this will get set again. + */ + need_completion_work = false; + break; + case IOMAP_UNWRITTEN: + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + break; + default: + break; + } if (iomap->flags & IOMAP_F_ATOMIC_BIO) { /* @@ -354,35 +359,54 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio_opf |= REQ_ATOMIC; } - if (iomap->type == IOMAP_UNWRITTEN) { - dio->flags |= IOMAP_DIO_UNWRITTEN; - need_zeroout = true; - } - - if (iomap->flags & IOMAP_F_SHARED) + if (iomap->flags & IOMAP_F_SHARED) { + /* + * Unsharing of needs to update metadata at I/O + * completion time. + */ + need_completion_work = true; dio->flags |= IOMAP_DIO_COW; + } - if (iomap->flags & IOMAP_F_NEW) + if (iomap->flags & IOMAP_F_NEW) { + /* + * Newly allocated blocks might need recording in + * metadata at I/O completion time. + */ + need_completion_work = true; need_zeroout = true; - else if (iomap->type == IOMAP_MAPPED && - iomap_dio_can_use_fua(iomap, dio)) - bio_opf |= REQ_FUA; + } - if (!(bio_opf & REQ_FUA)) - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + /* + * Use a FUA write if we need datasync semantics and this is a + * pure overwrite that doesn't require any metadata updates. + * + * This allows us to avoid cache flushes on I/O completion. + */ + if (dio->flags & IOMAP_DIO_WRITE_THROUGH) { + if (!need_completion_work && + !(iomap->flags & IOMAP_F_DIRTY) && + (!bdev_write_cache(iomap->bdev) || + bdev_fua(iomap->bdev))) + bio_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + } /* - * We can only do deferred completion for pure overwrites that + * We can only do inline completion for pure overwrites that * don't require additional I/O at completion time. * - * This rules out writes that need zeroing or extent conversion, - * extend the file size, or issue metadata I/O or cache flushes - * during completion processing. + * This rules out writes that need zeroing or metdata updates to + * convert unwritten or shared extents. + * + * Writes that extend i_size are also not supported, but this is + * handled in __iomap_dio_rw(). */ - if (need_zeroout || (pos >= i_size_read(inode)) || - ((dio->flags & IOMAP_DIO_NEED_SYNC) && - !(bio_opf & REQ_FUA))) - dio->flags &= ~IOMAP_DIO_CALLER_COMP; + if (need_completion_work) + dio->flags |= IOMAP_DIO_COMP_WORK; + + bio_opf |= REQ_OP_WRITE; } else { bio_opf |= REQ_OP_READ; } @@ -403,7 +427,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) * ones we set for inline and deferred completions. If none of those * are available for this IO, clear the polled flag. */ - if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP))) + if (dio->flags & IOMAP_DIO_COMP_WORK) dio->iocb->ki_flags &= ~IOCB_HIPRI; if (need_zeroout) { @@ -434,7 +458,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio->bi_end_io = iomap_dio_bio_end_io; ret = bio_iov_iter_get_pages(bio, dio->submit.iter, - bdev_logical_block_size(iomap->bdev) - 1); + alignment - 1); if (unlikely(ret)) { /* * We have to stop part way through an IO. We must fall @@ -496,7 +520,7 @@ out: /* Undo iter limitation to current extent */ iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) - return iomap_iter_advance(iter, &copied); + return iomap_iter_advance(iter, copied); return ret; } @@ -507,7 +531,7 @@ static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio) dio->size += length; if (!length) return -EFAULT; - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) @@ -542,7 +566,7 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) dio->size += copied; if (!copied) return -EFAULT; - return iomap_iter_advance(iomi, &copied); + return iomap_iter_advance(iomi, copied); } static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio) @@ -639,10 +663,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; - if (iov_iter_rw(iter) == READ) { - /* reads can always complete inline */ - dio->flags |= IOMAP_DIO_INLINE_COMP; + if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED) + dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + if (iov_iter_rw(iter) == READ) { if (iomi.pos >= dio->i_size) goto out_free_dio; @@ -656,15 +680,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; - /* - * Flag as supporting deferred completions, if the issuer - * groks it. This can avoid a workqueue punt for writes. - * We may later clear this flag if we need to do other IO - * as part of this IO completion. - */ - if (iocb->ki_flags & IOCB_DIO_CALLER_COMP) - dio->flags |= IOMAP_DIO_CALLER_COMP; - if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (iomi.pos >= dio->i_size || @@ -694,6 +709,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } /* + * i_size updates must to happen from process context. + */ + if (iomi.pos + iomi.len > dio->i_size) + dio->flags |= IOMAP_DIO_COMP_WORK; + + /* * Try to invalidate cache pages for the range we are writing. * If this invalidation fails, let the caller fall back to * buffered I/O. @@ -717,12 +738,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } goto out_free_dio; } + } - if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) { - ret = sb_init_dio_done_wq(inode->i_sb); - if (ret < 0) - goto out_free_dio; - } + if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + goto out_free_dio; } inode_dio_begin(inode); @@ -765,9 +786,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, * If all the writes we issued were already written through to the * media, we don't need to flush the cache on IO completion. Clear the * sync flag for this case. + * + * Otherwise clear the inline completion flag if any sync work is + * needed, as that needs to be performed from process context. */ if (dio->flags & IOMAP_DIO_WRITE_THROUGH) dio->flags &= ~IOMAP_DIO_NEED_SYNC; + else if (dio->flags & IOMAP_DIO_NEED_SYNC) + dio->flags |= IOMAP_DIO_COMP_WORK; /* * We are about to drop our additional submission reference, which @@ -825,15 +851,3 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); - -static int __init iomap_dio_init(void) -{ - zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, - IOMAP_ZERO_PAGE_ORDER); - - if (!zero_page) - return -ENOMEM; - - return 0; -} -fs_initcall(iomap_dio_init); diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h index d05cb3aed96e..3a4e4aad2bd1 100644 --- a/fs/iomap/internal.h +++ b/fs/iomap/internal.h @@ -6,4 +6,16 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); +#ifdef CONFIG_BLOCK +int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len); +#else +static inline int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t len) +{ + WARN_ON_ONCE(1); + return -EIO; +} +#endif /* CONFIG_BLOCK */ + #endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index b49fa75eab26..86f44922ed3b 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -194,8 +194,6 @@ new_ioend: if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) goto new_ioend; - iomap_start_folio_write(wpc->inode, folio, map_len); - /* * Clamp io_offset and io_size to the incore EOF so that ondisk * file size updates in the ioend completion are byte-accurate. diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index cef77ca0c20b..8692e5e41c6d 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -8,22 +8,24 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { + if (iter->fbatch) { + folio_batch_release(iter->fbatch); + kfree(iter->fbatch); + iter->fbatch = NULL; + } + iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); } -/* - * Advance the current iterator position and output the length remaining for the - * current mapping. - */ -int iomap_iter_advance(struct iomap_iter *iter, u64 *count) +/* Advance the current iterator position and decrement the remaining length */ +int iomap_iter_advance(struct iomap_iter *iter, u64 count) { - if (WARN_ON_ONCE(*count > iomap_length(iter))) + if (WARN_ON_ONCE(count > iomap_length(iter))) return -EIO; - iter->pos += *count; - iter->len -= *count; - *count = iomap_length(iter); + iter->pos += count; + iter->len -= count; return 0; } diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 56db2dd4b10d..6cbc587c93da 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -16,13 +16,13 @@ static int iomap_seek_hole_iter(struct iomap_iter *iter, *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_HOLE); if (*hole_pos == iter->pos + length) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); return 0; case IOMAP_HOLE: *hole_pos = iter->pos; return 0; default: - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); } } @@ -59,12 +59,12 @@ static int iomap_seek_data_iter(struct iomap_iter *iter, switch (iter->iomap.type) { case IOMAP_HOLE: - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); case IOMAP_UNWRITTEN: *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_DATA); if (*hole_pos < 0) - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, length); return 0; default: *hole_pos = iter->pos; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index a61c1dae4742..532787277b16 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -122,9 +122,10 @@ DEFINE_RANGE_EVENT(iomap_zero_iter); #define IOMAP_DIO_STRINGS \ - {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ - {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ - {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } + {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ + {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ + {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }, \ + {IOMAP_DIO_FSBLOCK_ALIGNED, "DIO_FSBLOCK_ALIGNED" } DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 6f0e6b19383c..b7cbe126faf3 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -610,6 +610,11 @@ static int isofs_fill_super(struct super_block *s, struct fs_context *fc) goto out_freesbi; } opt->blocksize = sb_min_blocksize(s, opt->blocksize); + if (!opt->blocksize) { + printk(KERN_ERR + "ISOFS: unable to set blocksize\n"); + goto out_freesbi; + } sbi->s_high_sierra = 0; /* default is iso9660 */ sbi->s_session = opt->session; @@ -1515,7 +1520,7 @@ struct inode *__isofs_iget(struct super_block *sb, if (!inode) return ERR_PTR(-ENOMEM); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { ret = isofs_read_inode(inode, relocated); if (ret < 0) { iget_failed(inode); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index d175cccb7c55..764bba8ba999 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -265,7 +265,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; f = JFFS2_INODE_INFO(inode); @@ -373,7 +373,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) { struct iattr iattr; - if (!(inode->i_state & I_DIRTY_DATASYNC)) { + if (!(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) { jffs2_dbg(2, "%s(): not calling setattr() for ino #%lu\n", __func__, inode->i_ino); return; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 2a4a288b821c..87ad042221e7 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -26,8 +26,8 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) return rc; inode_lock(inode); - if (!(inode->i_state & I_DIRTY_ALL) || - (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { + if (!(inode_state_read_once(inode) & I_DIRTY_ALL) || + (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); inode_unlock(inode); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 21f3d029da7d..4709762713ef 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -29,7 +29,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ret = diRead(inode); diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 10934f9a11be..5aaafedb8fbc 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -76,14 +76,14 @@ struct jfs_inode_info { struct { unchar _unused[16]; /* 16: */ dxd_t _dxd; /* 16: */ - /* _inline may overflow into _inline_ea when needed */ + /* _inline_sym may overflow into _inline_ea when needed */ /* _inline_ea may overlay the last part of * file._xtroot if maxentry = XTROOTINITSLOT */ union { struct { /* 128: inline symlink */ - unchar _inline[128]; + unchar _inline_sym[128]; /* 128: inline extended attr */ unchar _inline_ea[128]; }; @@ -101,7 +101,7 @@ struct jfs_inode_info { #define i_imap u.file._imap #define i_dirtable u.dir._table #define i_dtroot u.dir._dtroot -#define i_inline u.link._inline +#define i_inline u.link._inline_sym #define i_inline_ea u.link._inline_ea #define i_inline_all u.link._inline_all diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 7840a03e5bcb..c16578af3a77 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1287,7 +1287,7 @@ int txCommit(tid_t tid, /* transaction identifier */ * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. * Joern */ - if (tblk->u.ip->i_state & I_SYNC) + if (inode_state_read_once(tblk->u.ip) & I_SYNC) tblk->xflag &= ~COMMIT_LAZY; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 457f91c412d4..a36aaee98dce 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -251,7 +251,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) struct inode *inode; inode = iget_locked(sb, kernfs_ino(kn)); - if (inode && (inode->i_state & I_NEW)) + if (inode && (inode_state_read_once(inode) & I_NEW)) kernfs_init_inode(kn, inode); return inode; diff --git a/fs/libfs.c b/fs/libfs.c index ce8c496a6940..2d6657947abd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -680,6 +680,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; + s->s_d_flags |= ctx->s_d_flags; root = new_inode(s); if (!root) return -ENOMEM; @@ -1542,9 +1543,9 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY_ALL)) + if (!(inode_state_read_once(inode) & I_DIRTY_ALL)) goto out; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); @@ -1664,7 +1665,7 @@ struct inode *alloc_anon_inode(struct super_block *s) * list because mark_inode_dirty() will think * that it already _is_ on the dirty list. */ - inode->i_state = I_DIRTY; + inode_state_assign_raw(inode, I_DIRTY); /* * Historically anonymous inodes don't have a type at all and * userspace has come to rely on this. @@ -2289,27 +2290,25 @@ void stashed_dentry_prune(struct dentry *dentry) cmpxchg(stashed, dentry, NULL); } -/* parent must be held exclusive */ +/** + * simple_start_creating - prepare to create a given name + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Required lock is taken and a lookup in performed prior to creating an + * object in a directory. No permission checking is performed. + * + * Returns: a negative dentry on which vfs_create() or similar may + * be attempted, or an error. + */ struct dentry *simple_start_creating(struct dentry *parent, const char *name) { - struct dentry *dentry; - struct inode *dir = d_inode(parent); + struct qstr qname = QSTR(name); + int err; - inode_lock(dir); - if (unlikely(IS_DEADDIR(dir))) { - inode_unlock(dir); - return ERR_PTR(-ENOENT); - } - dentry = lookup_noperm(&QSTR(name), parent); - if (IS_ERR(dentry)) { - inode_unlock(dir); - return dentry; - } - if (dentry->d_inode) { - dput(dentry); - inode_unlock(dir); - return ERR_PTR(-EEXIST); - } - return dentry; + err = lookup_noperm_common(&qname, parent); + if (err) + return ERR_PTR(err); + return start_dirop(parent, &qname, LOOKUP_CREATE | LOOKUP_EXCL); } EXPORT_SYMBOL(simple_start_creating); diff --git a/fs/locks.c b/fs/locks.c index 04a3f0e20724..9f565802a88c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -585,7 +585,7 @@ static const struct lease_manager_operations lease_manager_ops = { /* * Initialize a lease, use the default lock manager operations */ -static int lease_init(struct file *filp, int type, struct file_lease *fl) +static int lease_init(struct file *filp, unsigned int flags, int type, struct file_lease *fl) { if (assign_type(&fl->c, type) != 0) return -EINVAL; @@ -594,13 +594,13 @@ static int lease_init(struct file *filp, int type, struct file_lease *fl) fl->c.flc_pid = current->tgid; fl->c.flc_file = filp; - fl->c.flc_flags = FL_LEASE; + fl->c.flc_flags = flags; fl->fl_lmops = &lease_manager_ops; return 0; } /* Allocate a file_lock initialised to this type of lease */ -static struct file_lease *lease_alloc(struct file *filp, int type) +static struct file_lease *lease_alloc(struct file *filp, unsigned int flags, int type) { struct file_lease *fl = locks_alloc_lease(); int error = -ENOMEM; @@ -608,7 +608,7 @@ static struct file_lease *lease_alloc(struct file *filp, int type) if (fl == NULL) return ERR_PTR(error); - error = lease_init(filp, type, fl); + error = lease_init(filp, flags, type, fl); if (error) { locks_free_lease(fl); return ERR_PTR(error); @@ -1529,29 +1529,35 @@ any_leases_conflict(struct inode *inode, struct file_lease *breaker) /** * __break_lease - revoke all outstanding leases on file * @inode: the inode of the file to return - * @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR: - * break all leases - * @type: FL_LEASE: break leases and delegations; FL_DELEG: break - * only delegations + * @flags: LEASE_BREAK_* flags * * break_lease (inlined for speed) has checked there already is at least * some kind of lock (maybe a lease) on this file. Leases are broken on - * a call to open() or truncate(). This function can sleep unless you - * specified %O_NONBLOCK to your open(). + * a call to open() or truncate(). This function can block waiting for the + * lease break unless you specify LEASE_BREAK_NONBLOCK. */ -int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +int __break_lease(struct inode *inode, unsigned int flags) { - int error = 0; - struct file_lock_context *ctx; struct file_lease *new_fl, *fl, *tmp; + struct file_lock_context *ctx; unsigned long break_time; - int want_write = (mode & O_ACCMODE) != O_RDONLY; + unsigned int type; LIST_HEAD(dispose); + bool want_write = !(flags & LEASE_BREAK_OPEN_RDONLY); + int error = 0; - new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); + if (flags & LEASE_BREAK_LEASE) + type = FL_LEASE; + else if (flags & LEASE_BREAK_DELEG) + type = FL_DELEG; + else if (flags & LEASE_BREAK_LAYOUT) + type = FL_LAYOUT; + else + return -EINVAL; + + new_fl = lease_alloc(NULL, type, want_write ? F_WRLCK : F_RDLCK); if (IS_ERR(new_fl)) return PTR_ERR(new_fl); - new_fl->c.flc_flags = type; /* typically we will check that ctx is non-NULL before calling */ ctx = locks_inode_context(inode); @@ -1596,7 +1602,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) if (list_empty(&ctx->flc_lease)) goto out; - if (mode & O_NONBLOCK) { + if (flags & LEASE_BREAK_NONBLOCK) { trace_break_lease_noblock(inode, new_fl); error = -EWOULDBLOCK; goto out; @@ -1675,8 +1681,9 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time) EXPORT_SYMBOL(lease_get_mtime); /** - * fcntl_getlease - Enquire what lease is currently active + * __fcntl_getlease - Enquire what lease is currently active * @filp: the file + * @flavor: type of lease flags to check * * The value returned by this function will be one of * (if no lease break is pending): @@ -1697,7 +1704,7 @@ EXPORT_SYMBOL(lease_get_mtime); * XXX: sfr & willy disagree over whether F_INPROGRESS * should be returned to userspace. */ -int fcntl_getlease(struct file *filp) +static int __fcntl_getlease(struct file *filp, unsigned int flavor) { struct file_lease *fl; struct inode *inode = file_inode(filp); @@ -1713,7 +1720,8 @@ int fcntl_getlease(struct file *filp) list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) { if (fl->c.flc_file != filp) continue; - type = target_leasetype(fl); + if (fl->c.flc_flags & flavor) + type = target_leasetype(fl); break; } spin_unlock(&ctx->flc_lock); @@ -1724,6 +1732,19 @@ int fcntl_getlease(struct file *filp) return type; } +int fcntl_getlease(struct file *filp) +{ + return __fcntl_getlease(filp, FL_LEASE); +} + +int fcntl_getdeleg(struct file *filp, struct delegation *deleg) +{ + if (deleg->d_flags != 0 || deleg->__pad != 0) + return -EINVAL; + deleg->d_type = __fcntl_getlease(filp, FL_DELEG); + return 0; +} + /** * check_conflicting_open - see if the given file points to an inode that has * an existing open that would conflict with the @@ -1929,11 +1950,19 @@ static int generic_delete_lease(struct file *filp, void *owner) int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { + struct inode *inode = file_inode(filp); + + if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return -EINVAL; + switch (arg) { case F_UNLCK: return generic_delete_lease(filp, *priv); - case F_RDLCK: case F_WRLCK: + if (S_ISDIR(inode->i_mode)) + return -EINVAL; + fallthrough; + case F_RDLCK: if (!(*flp)->fl_lmops->lm_break) { WARN_ON_ONCE(1); return -ENOLCK; @@ -2018,8 +2047,6 @@ vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE)) return -EACCES; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; error = security_file_lock(filp, arg); if (error) return error; @@ -2027,13 +2054,13 @@ vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) } EXPORT_SYMBOL_GPL(vfs_setlease); -static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, unsigned int flavor, int arg) { struct file_lease *fl; struct fasync_struct *new; int error; - fl = lease_alloc(filp, arg); + fl = lease_alloc(filp, flavor, arg); if (IS_ERR(fl)) return PTR_ERR(fl); @@ -2064,9 +2091,33 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) */ int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { + if (S_ISDIR(file_inode(filp)->i_mode)) + return -EINVAL; + if (arg == F_UNLCK) return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); - return do_fcntl_add_lease(fd, filp, arg); + return do_fcntl_add_lease(fd, filp, FL_LEASE, arg); +} + +/** + * fcntl_setdeleg - sets a delegation on an open file + * @fd: open file descriptor + * @filp: file pointer + * @deleg: delegation request from userland + * + * Call this fcntl to establish a delegation on the file. + * Note that you also need to call %F_SETSIG to + * receive a signal when the lease is broken. + */ +int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg) +{ + /* For now, no flags are supported */ + if (deleg->d_flags != 0 || deleg->__pad != 0) + return -EINVAL; + + if (deleg->d_type == F_UNLCK) + return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); + return do_fcntl_add_lease(fd, filp, FL_DELEG, deleg->d_type); } /** diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 32db676127a9..51ea9bdc813f 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -26,6 +26,22 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc); static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); +void __minix_error_inode(struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "minix-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); + va_end(args); +} + static void minix_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); @@ -589,7 +605,7 @@ struct inode *minix_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; if (INODE_VERSION(inode) == MINIX_V1) diff --git a/fs/minix/minix.h b/fs/minix/minix.h index d54273c3c9ff..2bfaf377f208 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -42,6 +42,9 @@ struct minix_sb_info { unsigned short s_version; }; +void __minix_error_inode(struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); + struct inode *minix_iget(struct super_block *, unsigned long); struct minix_inode *minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); struct minix2_inode *minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); @@ -168,4 +171,10 @@ static inline int minix_test_bit(int nr, const void *vaddr) #endif +#define minix_error_inode(inode, fmt, ...) \ + __minix_error_inode((inode), __func__, __LINE__, \ + (fmt), ##__VA_ARGS__) + +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + #endif /* FS_MINIX_H */ diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 8938536d8d3c..263e4ba8b1c8 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -145,6 +145,11 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry) struct minix_dir_entry * de; int err; + if (inode->i_nlink == 0) { + minix_error_inode(inode, "inode has corrupted nlink"); + return -EFSCORRUPTED; + } + de = minix_find_entry(dentry, &folio); if (!de) return -ENOENT; @@ -161,15 +166,24 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry) static int minix_rmdir(struct inode * dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); - int err = -ENOTEMPTY; + int err = -EFSCORRUPTED; - if (minix_empty_dir(inode)) { - err = minix_unlink(dir, dentry); - if (!err) { - inode_dec_link_count(dir); - inode_dec_link_count(inode); - } + if (dir->i_nlink <= 2) { + minix_error_inode(dir, "inode has corrupted nlink"); + goto out; + } + + err = -ENOTEMPTY; + if (!minix_empty_dir(inode)) + goto out; + + err = minix_unlink(dir, dentry); + if (!err) { + inode_dec_link_count(dir); + inode_dec_link_count(inode); } + +out: return err; } @@ -208,6 +222,17 @@ static int minix_rename(struct mnt_idmap *idmap, if (dir_de && !minix_empty_dir(new_inode)) goto out_dir; + err = -EFSCORRUPTED; + if (new_inode->i_nlink == 0 || (dir_de && new_inode->i_nlink != 2)) { + minix_error_inode(new_inode, "inode has corrupted nlink"); + goto out_dir; + } + + if (dir_de && old_dir->i_nlink <= 2) { + minix_error_inode(old_dir, "inode has corrupted nlink"); + goto out_dir; + } + err = -ENOENT; new_de = minix_find_entry(new_dentry, &new_folio); if (!new_de) diff --git a/fs/mount.h b/fs/mount.h index f13a28752d0b..2d28ef2a3aed 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -27,6 +27,7 @@ struct mnt_namespace { unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; refcount_t passive; /* number references not pinning @mounts */ + bool is_anon; } __randomize_layout; struct mnt_pcp { @@ -175,7 +176,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry) static inline bool is_anon_ns(struct mnt_namespace *ns) { - return ns->ns.ns_id == 0; + return ns->is_anon; } static inline bool anon_ns_root(const struct mount *m) diff --git a/fs/namei.c b/fs/namei.c index 7377020a2cba..bf0f66f0e9b9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -282,7 +282,7 @@ void putname(struct filename *name) return; refcnt = atomic_read(&name->refcnt); - if (refcnt != 1) { + if (unlikely(refcnt != 1)) { if (WARN_ON_ONCE(!refcnt)) return; @@ -290,7 +290,7 @@ void putname(struct filename *name) return; } - if (name->name != name->iname) { + if (unlikely(name->name != name->iname)) { __putname(name->name); kfree(name); } else @@ -540,10 +540,13 @@ static inline int do_inode_permission(struct mnt_idmap *idmap, * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. + * + * Note: lookup_inode_permission_may_exec() does not call here. If you add + * MAY_EXEC checks, adjust it. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ @@ -574,7 +577,7 @@ int inode_permission(struct mnt_idmap *idmap, if (unlikely(retval)) return retval; - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { /* * Nobody gets write access to an immutable file. */ @@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap, } EXPORT_SYMBOL(inode_permission); +/* + * lookup_inode_permission_may_exec - Check traversal right for given inode + * + * This is a special case routine for may_lookup() making assumptions specific + * to path traversal. Use inode_permission() if you are doing something else. + * + * Work is shaved off compared to inode_permission() as follows: + * - we know for a fact there is no MAY_WRITE to worry about + * - it is an invariant the inode is a directory + * + * Since majority of real-world traversal happens on inodes which grant it for + * everyone, we check it upfront and only resort to more expensive work if it + * fails. + * + * Filesystems which have their own ->permission hook and consequently miss out + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC + * on their directory inodes. + */ +static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + /* Lookup already checked this to return -ENOTDIR */ + VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode); + VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0); + + mask |= MAY_EXEC; + + if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC)))) + return inode_permission(idmap, inode, mask); + + if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode))) + return inode_permission(idmap, inode, mask); + + return security_inode_permission(inode, mask); +} + /** * path_get - get a reference to a path * @path: path to get the reference to @@ -746,7 +785,8 @@ static void leave_rcu(struct nameidata *nd) static void terminate_walk(struct nameidata *nd) { - drop_links(nd); + if (unlikely(nd->depth)) + drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); @@ -843,7 +883,7 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (unlikely(!legitimize_links(nd))) + if (unlikely(nd->depth && !legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; @@ -878,7 +918,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (unlikely(!legitimize_links(nd))) + if (unlikely(nd->depth && !legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { @@ -951,8 +991,8 @@ static int complete_walk(struct nameidata *nd) * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ - if (!(nd->state & ND_ROOT_PRESET)) - if (!(nd->flags & LOOKUP_IS_SCOPED)) + if (likely(!(nd->state & ND_ROOT_PRESET))) + if (likely(!(nd->flags & LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) @@ -1034,7 +1074,7 @@ static int nd_jump_root(struct nameidata *nd) } if (!nd->root.mnt) { int error = set_root(nd); - if (error) + if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) { @@ -1632,13 +1672,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; + if (likely(!d_managed(dentry))) + return 0; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; - if (!try_to_unlazy_next(nd, dentry)) + if (unlikely(!try_to_unlazy_next(nd, dentry))) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); @@ -1823,7 +1865,7 @@ again: return dentry; } -static struct dentry *lookup_slow(const struct qstr *name, +static noinline struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { @@ -1855,7 +1897,7 @@ static inline int may_lookup(struct mnt_idmap *idmap, int err, mask; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; - err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); + err = lookup_inode_permission_may_exec(idmap, nd->inode, mask); if (likely(!err)) return 0; @@ -1870,7 +1912,7 @@ static inline int may_lookup(struct mnt_idmap *idmap, if (err != -ECHILD) // hard error return err; - return inode_permission(idmap, nd->inode, MAY_EXEC); + return lookup_inode_permission_may_exec(idmap, nd->inode, 0); } static int reserve_stack(struct nameidata *nd, struct path *link) @@ -1901,13 +1943,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link) enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; -static const char *pick_link(struct nameidata *nd, struct path *link, +static noinline const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; - int error = reserve_stack(nd, link); + int error; + if (nd->flags & LOOKUP_RCU) { + /* make sure that d_is_symlink from step_into_slowpath() matches the inode */ + if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + } else { + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + } + + error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); @@ -1981,14 +2033,15 @@ all_done: // pure jump * * NOTE: dentry must be what nd->next_seq had been sampled from. */ -static const char *step_into(struct nameidata *nd, int flags, +static noinline const char *step_into_slowpath(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; - int err = handle_mounts(nd, dentry, &path); + int err; - if (err < 0) + err = handle_mounts(nd, dentry, &path); + if (unlikely(err < 0)) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || @@ -2010,15 +2063,32 @@ static const char *step_into(struct nameidata *nd, int flags, nd->seq = nd->next_seq; return NULL; } - if (nd->flags & LOOKUP_RCU) { - /* make sure that d_is_symlink above matches inode */ - if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) + return pick_link(nd, &path, inode, flags); +} + +static __always_inline const char *step_into(struct nameidata *nd, int flags, + struct dentry *dentry) +{ + /* + * In the common case we are in rcu-walk and traversing over a non-mounted on + * directory (as opposed to e.g., a symlink). + * + * We can handle that and negative entries with the checks below. + */ + if (likely((nd->flags & LOOKUP_RCU) && + !d_managed(dentry) && !d_is_symlink(dentry))) { + struct inode *inode = dentry->d_inode; + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); - } else { - if (path.mnt == nd->path.mnt) - mntget(path.mnt); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + nd->path.dentry = dentry; + /* nd->path.mnt is retained on purpose */ + nd->inode = inode; + nd->seq = nd->next_seq; + return NULL; } - return pick_link(nd, &path, inode, flags); + return step_into_slowpath(nd, flags, dentry); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) @@ -2101,7 +2171,7 @@ static const char *handle_dots(struct nameidata *nd, int type) if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); - if (error) + if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) @@ -2131,7 +2201,7 @@ static const char *handle_dots(struct nameidata *nd, int type) return NULL; } -static const char *walk_component(struct nameidata *nd, int flags) +static __always_inline const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* @@ -2140,7 +2210,7 @@ static const char *walk_component(struct nameidata *nd, int flags) * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { - if (!(flags & WALK_MORE) && nd->depth) + if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return handle_dots(nd, nd->last_type); } @@ -2152,7 +2222,7 @@ static const char *walk_component(struct nameidata *nd, int flags) if (IS_ERR(dentry)) return ERR_CAST(dentry); } - if (!(flags & WALK_MORE) && nd->depth) + if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return step_into(nd, flags, dentry); } @@ -2505,7 +2575,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ - if (!depth) { + if (likely(!depth)) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; @@ -2543,10 +2613,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags) const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ - if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) + if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)) return ERR_PTR(-EAGAIN); - if (!*s) + if (unlikely(!*s)) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); @@ -2560,7 +2630,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); - if (nd->state & ND_ROOT_PRESET) { + if (unlikely(nd->state & ND_ROOT_PRESET)) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) @@ -2579,7 +2649,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ - if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { + if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); @@ -2632,7 +2702,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } /* For scoped-lookups we need to set the root to the dirfd as well. */ - if (flags & LOOKUP_IS_SCOPED) { + if (unlikely(flags & LOOKUP_IS_SCOPED)) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; @@ -2765,6 +2835,62 @@ static int filename_parentat(int dfd, struct filename *name, return __filename_parentat(dfd, name, flags, parent, last, type, NULL); } +/** + * start_dirop - begin a create or remove dirop, performing locking and lookup + * @parent: the dentry of the parent in which the operation will occur + * @name: a qstr holding the name within that parent + * @lookup_flags: intent and other lookup flags. + * + * The lookup is performed and necessary locks are taken so that, on success, + * the returned dentry can be operated on safely. + * The qstr must already have the hash value calculated. + * + * Returns: a locked dentry, or an error. + * + */ +static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags, + unsigned int state) +{ + struct dentry *dentry; + struct inode *dir = d_inode(parent); + + if (state == TASK_KILLABLE) { + int ret = down_write_killable_nested(&dir->i_rwsem, + I_MUTEX_PARENT); + if (ret) + return ERR_PTR(ret); + } else { + inode_lock_nested(dir, I_MUTEX_PARENT); + } + dentry = lookup_one_qstr_excl(name, parent, lookup_flags); + if (IS_ERR(dentry)) + inode_unlock(dir); + return dentry; +} + +struct dentry *start_dirop(struct dentry *parent, struct qstr *name, + unsigned int lookup_flags) +{ + return __start_dirop(parent, name, lookup_flags, TASK_NORMAL); +} + +/** + * end_dirop - signal completion of a dirop + * @de: the dentry which was returned by start_dirop or similar. + * + * If the de is an error, nothing happens. Otherwise any lock taken to + * protect the dentry is dropped and the dentry itself is release (dput()). + */ +void end_dirop(struct dentry *de) +{ + if (!IS_ERR(de)) { + inode_unlock(de->d_parent->d_inode); + dput(de); + } +} +EXPORT_SYMBOL(end_dirop); + /* does lookup, returns the object with parent locked */ static struct dentry *__start_removing_path(int dfd, struct filename *name, struct path *path) @@ -2781,10 +2907,9 @@ static struct dentry *__start_removing_path(int dfd, struct filename *name, return ERR_PTR(-EINVAL); /* don't fail immediately if it's r/o, at least try to report other errors */ error = mnt_want_write(parent_path.mnt); - inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); + d = start_dirop(parent_path.dentry, &last, 0); if (IS_ERR(d)) - goto unlock; + goto drop; if (error) goto fail; path->dentry = no_free_ptr(parent_path.dentry); @@ -2792,10 +2917,9 @@ static struct dentry *__start_removing_path(int dfd, struct filename *name, return d; fail: - dput(d); + end_dirop(d); d = ERR_PTR(error); -unlock: - inode_unlock(parent_path.dentry->d_inode); +drop: if (!error) mnt_drop_write(parent_path.mnt); return d; @@ -2910,7 +3034,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -static int lookup_noperm_common(struct qstr *qname, struct dentry *base) +int lookup_noperm_common(struct qstr *qname, struct dentry *base) { const char *name = qname->name; u32 len = qname->len; @@ -3181,6 +3305,234 @@ struct dentry *lookup_noperm_positive_unlocked(struct qstr *name, } EXPORT_SYMBOL(lookup_noperm_positive_unlocked); +/** + * start_creating - prepare to create a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup is performed prior to creating + * an object in a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name already exists, a positive dentry is returned, so + * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail + * with -EEXIST. + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, LOOKUP_CREATE); +} +EXPORT_SYMBOL(start_creating); + +/** + * start_removing - prepare to remove a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, 0); +} +EXPORT_SYMBOL(start_removing); + +/** + * start_creating_killable - prepare to create a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup in performed prior to creating + * an object in a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name already exists, a positive dentry is returned. + * + * If a signal is received or was already pending, the function aborts + * with -EINTR; + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE); +} +EXPORT_SYMBOL(start_creating_killable); + +/** + * start_removing_killable - prepare to remove a given name with permission checking + * @idmap: idmap of the mount + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. Permission checking (MAY_EXEC) is performed + * against @idmap. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * If a signal is received or was already pending, the function aborts + * with -EINTR; + * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name) +{ + int err = lookup_one_common(idmap, name, parent); + + if (err) + return ERR_PTR(err); + return __start_dirop(parent, name, 0, TASK_KILLABLE); +} +EXPORT_SYMBOL(start_removing_killable); + +/** + * start_creating_noperm - prepare to create a given name without permission checking + * @parent: directory in which to prepare to create the name + * @name: the name to be created + * + * Locks are taken and a lookup in performed prior to creating + * an object in a directory. + * + * If the name already exists, a positive dentry is returned. + * + * Returns: a negative or positive dentry, or an error. + */ +struct dentry *start_creating_noperm(struct dentry *parent, + struct qstr *name) +{ + int err = lookup_noperm_common(name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, LOOKUP_CREATE); +} +EXPORT_SYMBOL(start_creating_noperm); + +/** + * start_removing_noperm - prepare to remove a given name without permission checking + * @parent: directory in which to find the name + * @name: the name to be removed + * + * Locks are taken and a lookup in performed prior to removing + * an object from a directory. + * + * If the name doesn't exist, an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: a positive dentry, or an error. + */ +struct dentry *start_removing_noperm(struct dentry *parent, + struct qstr *name) +{ + int err = lookup_noperm_common(name, parent); + + if (err) + return ERR_PTR(err); + return start_dirop(parent, name, 0); +} +EXPORT_SYMBOL(start_removing_noperm); + +/** + * start_creating_dentry - prepare to create a given dentry + * @parent: directory from which dentry should be removed + * @child: the dentry to be removed + * + * A lock is taken to protect the dentry again other dirops and + * the validity of the dentry is checked: correct parent and still hashed. + * + * If the dentry is valid and negative a reference is taken and + * returned. If not an error is returned. + * + * end_creating() should be called when creation is complete, or aborted. + * + * Returns: the valid dentry, or an error. + */ +struct dentry *start_creating_dentry(struct dentry *parent, + struct dentry *child) +{ + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + if (unlikely(IS_DEADDIR(parent->d_inode) || + child->d_parent != parent || + d_unhashed(child))) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EINVAL); + } + if (d_is_positive(child)) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EEXIST); + } + return dget(child); +} +EXPORT_SYMBOL(start_creating_dentry); + +/** + * start_removing_dentry - prepare to remove a given dentry + * @parent: directory from which dentry should be removed + * @child: the dentry to be removed + * + * A lock is taken to protect the dentry again other dirops and + * the validity of the dentry is checked: correct parent and still hashed. + * + * If the dentry is valid and positive, a reference is taken and + * returned. If not an error is returned. + * + * end_removing() should be called when removal is complete, or aborted. + * + * Returns: the valid dentry, or an error. + */ +struct dentry *start_removing_dentry(struct dentry *parent, + struct dentry *child) +{ + inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); + if (unlikely(IS_DEADDIR(parent->d_inode) || + child->d_parent != parent || + d_unhashed(child))) { + inode_unlock(parent->d_inode); + return ERR_PTR(-EINVAL); + } + if (d_is_negative(child)) { + inode_unlock(parent->d_inode); + return ERR_PTR(-ENOENT); + } + return dget(child); +} +EXPORT_SYMBOL(start_removing_dentry); + #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { @@ -3419,6 +3771,290 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) EXPORT_SYMBOL(unlock_rename); /** + * __start_renaming - lookup and lock names for rename + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_last: name of object in @rd.old_parent + * @new_last: name of object in @rd.new_parent + * + * Look up two names and ensure locks are in place for + * rename. + * + * On success the found dentries are stored in @rd.old_dentry, + * @rd.new_dentry and an extra ref is taken on @rd.old_parent. + * These references and the lock are dropped by end_renaming(). + * + * The passed in qstrs must have the hash calculated, and no permission + * checking is performed. + * + * Returns: zero or an error. + */ +static int +__start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last) +{ + struct dentry *trap; + struct dentry *d1, *d2; + int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + int err; + + if (rd->flags & RENAME_EXCHANGE) + target_flags = 0; + if (rd->flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; + + trap = lock_rename(rd->old_parent, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + + d1 = lookup_one_qstr_excl(old_last, rd->old_parent, + lookup_flags); + err = PTR_ERR(d1); + if (IS_ERR(d1)) + goto out_unlock; + + d2 = lookup_one_qstr_excl(new_last, rd->new_parent, + lookup_flags | target_flags); + err = PTR_ERR(d2); + if (IS_ERR(d2)) + goto out_dput_d1; + + if (d1 == trap) { + /* source is an ancestor of target */ + err = -EINVAL; + goto out_dput_d2; + } + + if (d2 == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_dput_d2; + } + + rd->old_dentry = d1; + rd->new_dentry = d2; + dget(rd->old_parent); + return 0; + +out_dput_d2: + dput(d2); +out_dput_d1: + dput(d1); +out_unlock: + unlock_rename(rd->old_parent, rd->new_parent); + return err; +} + +/** + * start_renaming - lookup and lock names for rename with permission checking + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_last: name of object in @rd.old_parent + * @new_last: name of object in @rd.new_parent + * + * Look up two names and ensure locks are in place for + * rename. + * + * On success the found dentries are stored in @rd.old_dentry, + * @rd.new_dentry. Also the refcount on @rd->old_parent is increased. + * These references and the lock are dropped by end_renaming(). + * + * The passed in qstrs need not have the hash calculated, and basic + * eXecute permission checking is performed against @rd.mnt_idmap. + * + * Returns: zero or an error. + */ +int start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last) +{ + int err; + + err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent); + if (err) + return err; + err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); + if (err) + return err; + return __start_renaming(rd, lookup_flags, old_last, new_last); +} +EXPORT_SYMBOL(start_renaming); + +static int +__start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last) +{ + struct dentry *trap; + struct dentry *d2; + int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + int err; + + if (rd->flags & RENAME_EXCHANGE) + target_flags = 0; + if (rd->flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; + + /* Already have the dentry - need to be sure to lock the correct parent */ + trap = lock_rename_child(old_dentry, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + if (d_unhashed(old_dentry) || + (rd->old_parent && rd->old_parent != old_dentry->d_parent)) { + /* dentry was removed, or moved and explicit parent requested */ + err = -EINVAL; + goto out_unlock; + } + + d2 = lookup_one_qstr_excl(new_last, rd->new_parent, + lookup_flags | target_flags); + err = PTR_ERR(d2); + if (IS_ERR(d2)) + goto out_unlock; + + if (old_dentry == trap) { + /* source is an ancestor of target */ + err = -EINVAL; + goto out_dput_d2; + } + + if (d2 == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_dput_d2; + } + + rd->old_dentry = dget(old_dentry); + rd->new_dentry = d2; + rd->old_parent = dget(old_dentry->d_parent); + return 0; + +out_dput_d2: + dput(d2); +out_unlock: + unlock_rename(old_dentry->d_parent, rd->new_parent); + return err; +} + +/** + * start_renaming_dentry - lookup and lock name for rename with permission checking + * @rd: rename data containing parents and flags, and + * for receiving found dentries + * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL, + * LOOKUP_NO_SYMLINKS etc). + * @old_dentry: dentry of name to move + * @new_last: name of target in @rd.new_parent + * + * Look up target name and ensure locks are in place for + * rename. + * + * On success the found dentry is stored in @rd.new_dentry and + * @rd.old_parent is confirmed to be the parent of @old_dentry. If it + * was originally %NULL, it is set. In either case a reference is taken + * so that end_renaming() can have a stable reference to unlock. + * + * References and the lock can be dropped with end_renaming() + * + * The passed in qstr need not have the hash calculated, and basic + * eXecute permission checking is performed against @rd.mnt_idmap. + * + * Returns: zero or an error. + */ +int start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last) +{ + int err; + + err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent); + if (err) + return err; + return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last); +} +EXPORT_SYMBOL(start_renaming_dentry); + +/** + * start_renaming_two_dentries - Lock to dentries in given parents for rename + * @rd: rename data containing parent + * @old_dentry: dentry of name to move + * @new_dentry: dentry to move to + * + * Ensure locks are in place for rename and check parentage is still correct. + * + * On success the two dentries are stored in @rd.old_dentry and + * @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to + * be the parents of the dentries. + * + * References and the lock can be dropped with end_renaming() + * + * Returns: zero or an error. + */ +int +start_renaming_two_dentries(struct renamedata *rd, + struct dentry *old_dentry, struct dentry *new_dentry) +{ + struct dentry *trap; + int err; + + /* Already have the dentry - need to be sure to lock the correct parent */ + trap = lock_rename_child(old_dentry, rd->new_parent); + if (IS_ERR(trap)) + return PTR_ERR(trap); + err = -EINVAL; + if (d_unhashed(old_dentry) || + (rd->old_parent && rd->old_parent != old_dentry->d_parent)) + /* old_dentry was removed, or moved and explicit parent requested */ + goto out_unlock; + if (d_unhashed(new_dentry) || + rd->new_parent != new_dentry->d_parent) + /* new_dentry was removed or moved */ + goto out_unlock; + + if (old_dentry == trap) + /* source is an ancestor of target */ + goto out_unlock; + + if (new_dentry == trap) { + /* target is an ancestor of source */ + if (rd->flags & RENAME_EXCHANGE) + err = -EINVAL; + else + err = -ENOTEMPTY; + goto out_unlock; + } + + err = -EEXIST; + if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE)) + goto out_unlock; + + rd->old_dentry = dget(old_dentry); + rd->new_dentry = dget(new_dentry); + rd->old_parent = dget(old_dentry->d_parent); + return 0; + +out_unlock: + unlock_rename(old_dentry->d_parent, rd->new_parent); + return err; +} +EXPORT_SYMBOL(start_renaming_two_dentries); + +void end_renaming(struct renamedata *rd) +{ + unlock_rename(rd->old_parent, rd->new_parent); + dput(rd->old_dentry); + dput(rd->new_dentry); + dput(rd->old_parent); +} +EXPORT_SYMBOL(end_renaming); + +/** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode @@ -3461,10 +4097,9 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory * @dentry: dentry of the child file * @mode: mode of the child file - * @want_excl: whether the file must not yet exist + * @di: returns parent inode, if the inode is delegated. * * Create a new file. * @@ -3474,9 +4109,10 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ -int vfs_create(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, bool want_excl) +int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode, + struct delegated_inode *di) { + struct inode *dir = d_inode(dentry->d_parent); int error; error = may_create(idmap, dir, dentry); @@ -3490,7 +4126,10 @@ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); + error = try_break_deleg(dir, di); + if (error) + return error; + error = dir->i_op->create(idmap, dir, dentry, mode, true); if (!error) fsnotify_create(dir, dentry); return error; @@ -3697,7 +4336,7 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, */ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, - bool got_write) + bool got_write, struct delegated_inode *delegated_inode) { struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; @@ -3786,6 +4425,11 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { + /* but break the directory lease first! */ + error = try_break_deleg(dir_inode, delegated_inode); + if (error) + goto out_dput; + file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { @@ -3848,6 +4492,7 @@ static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { + struct delegated_inode delegated_inode = { }; struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; @@ -3879,7 +4524,7 @@ static const char *open_last_lookups(struct nameidata *nd, return ERR_PTR(-ECHILD); } } - +retry: if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* @@ -3892,7 +4537,7 @@ static const char *open_last_lookups(struct nameidata *nd, inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); - dentry = lookup_open(nd, file, op, got_write); + dentry = lookup_open(nd, file, op, got_write, &delegated_inode); if (!IS_ERR(dentry)) { if (file->f_mode & FMODE_CREATED) fsnotify_create(dir->d_inode, dentry); @@ -3907,8 +4552,16 @@ static const char *open_last_lookups(struct nameidata *nd, if (got_write) mnt_drop_write(nd->path.mnt); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + if (is_delegated(&delegated_inode)) { + int error = break_deleg_wait(&delegated_inode); + + if (!error) + goto retry; + return ERR_PTR(error); + } return ERR_CAST(dentry); + } if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); @@ -4036,7 +4689,7 @@ int vfs_tmpfile(struct mnt_idmap *idmap, inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); - inode->i_state |= I_LINKABLE; + inode_state_set(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } security_inode_post_create_tmpfile(idmap, inode); @@ -4223,21 +4876,18 @@ static struct dentry *filename_create(int dfd, struct filename *name, */ if (last.name[last.len] && !want_dir) create_flags &= ~LOOKUP_CREATE; - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr_excl(&last, path->dentry, - reval_flag | create_flags); + dentry = start_dirop(path->dentry, &last, reval_flag | create_flags); if (IS_ERR(dentry)) - goto unlock; + goto out_drop_write; if (unlikely(error)) goto fail; return dentry; fail: - dput(dentry); + end_dirop(dentry); dentry = ERR_PTR(error); -unlock: - inode_unlock(path->dentry->d_inode); +out_drop_write: if (!error) mnt_drop_write(path->mnt); out: @@ -4256,11 +4906,20 @@ struct dentry *start_creating_path(int dfd, const char *pathname, } EXPORT_SYMBOL(start_creating_path); +/** + * end_creating_path - finish a code section started by start_creating_path() + * @path: the path instantiated by start_creating_path() + * @dentry: the dentry returned by start_creating_path() + * + * end_creating_path() will unlock and locks taken by start_creating_path() + * and drop an references that were taken. It should only be called + * if start_creating_path() returned a non-error. + * If vfs_mkdir() was called and it returned an error, that error *should* + * be passed to end_creating_path() together with the path. + */ void end_creating_path(const struct path *path, struct dentry *dentry) { - if (!IS_ERR(dentry)) - dput(dentry); - inode_unlock(path->dentry->d_inode); + end_creating(dentry); mnt_drop_write(path->mnt); path_put(path); } @@ -4278,13 +4937,15 @@ inline struct dentry *start_creating_user_path( } EXPORT_SYMBOL(start_creating_user_path); + /** * vfs_mknod - create device node or file - * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory - * @dentry: dentry of the child device node - * @mode: mode of the child device node - * @dev: device number of device to create + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child device node + * @mode: mode of the child device node + * @dev: device number of device to create + * @delegated_inode: returns parent inode, if the inode is delegated. * * Create a device node or file. * @@ -4295,7 +4956,8 @@ EXPORT_SYMBOL(start_creating_user_path); * raw inode simply pass @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) + struct dentry *dentry, umode_t mode, dev_t dev, + struct delegated_inode *delegated_inode) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(idmap, dir, dentry); @@ -4319,6 +4981,10 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; + error = try_break_deleg(dir, delegated_inode); + if (error) + return error; + error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); @@ -4346,6 +5012,7 @@ static int may_mknod(umode_t mode) static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { + struct delegated_inode di = { }; struct mnt_idmap *idmap; struct dentry *dentry; struct path path; @@ -4369,22 +5036,26 @@ retry: idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(idmap, path.dentry->d_inode, - dentry, mode, true); + error = vfs_create(idmap, dentry, mode, &di); if (!error) security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, - dentry, mode, new_decode_dev(dev)); + dentry, mode, new_decode_dev(dev), &di); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(idmap, path.dentry->d_inode, - dentry, mode, 0); + dentry, mode, 0, &di); break; } out2: end_creating_path(&path, dentry); + if (is_delegated(&di)) { + error = break_deleg_wait(&di); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4407,10 +5078,11 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d /** * vfs_mkdir - create directory returning correct dentry if possible - * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory - * @dentry: dentry of the child directory - * @mode: mode of the child directory + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @mode: mode of the child directory + * @delegated_inode: returns parent inode, if the inode is delegated. * * Create a directory. * @@ -4427,7 +5099,8 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d * In case of an error the dentry is dput() and an ERR_PTR() is returned. */ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) + struct dentry *dentry, umode_t mode, + struct delegated_inode *delegated_inode) { int error; unsigned max_links = dir->i_sb->s_max_links; @@ -4450,6 +5123,10 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (max_links && dir->i_nlink >= max_links) goto err; + error = try_break_deleg(dir, delegated_inode); + if (error) + goto err; + de = dir->i_op->mkdir(idmap, dir, dentry, mode); error = PTR_ERR(de); if (IS_ERR(de)) @@ -4462,7 +5139,7 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, return dentry; err: - dput(dentry); + end_creating(dentry); return ERR_PTR(error); } EXPORT_SYMBOL(vfs_mkdir); @@ -4473,6 +5150,7 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode) struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; + struct delegated_inode delegated_inode = { }; retry: dentry = filename_create(dfd, name, &path, lookup_flags); @@ -4484,11 +5162,16 @@ retry: mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, mode); + dentry, mode, &delegated_inode); if (IS_ERR(dentry)) error = PTR_ERR(dentry); } end_creating_path(&path, dentry); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4510,9 +5193,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) /** * vfs_rmdir - remove directory - * @idmap: idmap of the mount the inode was found from - * @dir: inode of the parent directory - * @dentry: dentry of the child directory + * @idmap: idmap of the mount the inode was found from + * @dir: inode of the parent directory + * @dentry: dentry of the child directory + * @delegated_inode: returns parent inode, if it's delegated. * * Remove a directory. * @@ -4523,7 +5207,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) * raw inode simply pass @nop_mnt_idmap. */ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry) + struct dentry *dentry, struct delegated_inode *delegated_inode) { int error = may_delete(idmap, dir, dentry, 1); @@ -4545,6 +5229,10 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, if (error) goto out; + error = try_break_deleg(dir, delegated_inode); + if (error) + goto out; + error = dir->i_op->rmdir(dir, dentry); if (error) goto out; @@ -4571,6 +5259,7 @@ int do_rmdir(int dfd, struct filename *name) struct qstr last; int type; unsigned int lookup_flags = 0; + struct delegated_inode delegated_inode = { }; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) @@ -4592,22 +5281,26 @@ retry: if (error) goto exit2; - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); + dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; error = security_path_rmdir(&path, dentry); if (error) goto exit4; - error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); + error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); exit4: - dput(dentry); + end_dirop(dentry); exit3: - inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4648,7 +5341,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * raw inode simply pass @nop_mnt_idmap. */ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, struct inode **delegated_inode) + struct dentry *dentry, struct delegated_inode *delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(idmap, dir, dentry, 0); @@ -4667,6 +5360,9 @@ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, else { error = security_inode_unlink(dir, dentry); if (!error) { + error = try_break_deleg(dir, delegated_inode); + if (error) + goto out; error = try_break_deleg(target, delegated_inode); if (error) goto out; @@ -4705,67 +5401,62 @@ int do_unlinkat(int dfd, struct filename *name) struct path path; struct qstr last; int type; - struct inode *inode = NULL; - struct inode *delegated_inode = NULL; + struct inode *inode; + struct delegated_inode delegated_inode = { }; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) - goto exit1; + goto exit_putname; error = -EISDIR; if (type != LAST_NORM) - goto exit2; + goto exit_path_put; error = mnt_want_write(path.mnt); if (error) - goto exit2; + goto exit_path_put; retry_deleg: - inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); + dentry = start_dirop(path.dentry, &last, lookup_flags); error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { + if (IS_ERR(dentry)) + goto exit_drop_write; - /* Why not before? Because we want correct error value */ - if (last.name[last.len]) - goto slashes; - inode = dentry->d_inode; - ihold(inode); - error = security_path_unlink(&path, dentry); - if (error) - goto exit3; - error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, &delegated_inode); -exit3: - dput(dentry); + /* Why not before? Because we want correct error value */ + if (unlikely(last.name[last.len])) { + if (d_is_dir(dentry)) + error = -EISDIR; + else + error = -ENOTDIR; + end_dirop(dentry); + goto exit_drop_write; } - inode_unlock(path.dentry->d_inode); - if (inode) - iput(inode); /* truncate the inode here */ - inode = NULL; - if (delegated_inode) { + inode = dentry->d_inode; + ihold(inode); + error = security_path_unlink(&path, dentry); + if (error) + goto exit_end_dirop; + error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry, &delegated_inode); +exit_end_dirop: + end_dirop(dentry); + iput(inode); /* truncate the inode here */ + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } +exit_drop_write: mnt_drop_write(path.mnt); -exit2: +exit_path_put: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; - inode = NULL; goto retry; } -exit1: +exit_putname: putname(name); return error; - -slashes: - if (d_is_dir(dentry)) - error = -EISDIR; - else - error = -ENOTDIR; - goto exit3; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) @@ -4789,6 +5480,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) * @dir: inode of the parent directory * @dentry: dentry of the child symlink file * @oldname: name of the file to link to + * @delegated_inode: returns victim inode, if the inode is delegated. * * Create a symlink. * @@ -4799,7 +5491,8 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) * raw inode simply pass @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, const char *oldname) + struct dentry *dentry, const char *oldname, + struct delegated_inode *delegated_inode) { int error; @@ -4814,6 +5507,10 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; + error = try_break_deleg(dir, delegated_inode); + if (error) + return error; + error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); @@ -4827,6 +5524,7 @@ int do_symlinkat(struct filename *from, int newdfd, struct filename *to) struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; + struct delegated_inode delegated_inode = { }; if (IS_ERR(from)) { error = PTR_ERR(from); @@ -4841,8 +5539,13 @@ retry: error = security_path_symlink(&path, dentry, from->name); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, - dentry, from->name); + dentry, from->name, &delegated_inode); end_creating_path(&path, dentry); + if (is_delegated(&delegated_inode)) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry; + } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4892,7 +5595,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, - struct inode **delegated_inode) + struct delegated_inode *delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -4931,19 +5634,21 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else { - error = try_break_deleg(inode, delegated_inode); + error = try_break_deleg(dir, delegated_inode); + if (!error) + error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } - if (!error && (inode->i_state & I_LINKABLE)) { + if (!error && (inode_state_read_once(inode) & I_LINKABLE)) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_LINKABLE; + inode_state_clear(inode, I_LINKABLE); spin_unlock(&inode->i_lock); } inode_unlock(inode); @@ -4968,7 +5673,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int how = 0; int error; @@ -5012,7 +5717,7 @@ retry: new_dentry, &delegated_inode); out_dput: end_creating_path(&new_path, new_dentry); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); @@ -5098,7 +5803,7 @@ int vfs_rename(struct renamedata *rd) struct inode *new_dir = d_inode(rd->new_parent); struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; - struct inode **delegated_inode = rd->delegated_inode; + struct delegated_inode *delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; @@ -5203,6 +5908,14 @@ int vfs_rename(struct renamedata *rd) old_dir->i_nlink >= max_links) goto out; } + error = try_break_deleg(old_dir, delegated_inode); + if (error) + goto out; + if (new_dir != old_dir) { + error = try_break_deleg(new_dir, delegated_inode); + if (error) + goto out; + } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) @@ -5256,14 +5969,11 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; - struct dentry *old_dentry, *new_dentry; - struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; - struct inode *delegated_inode = NULL; - unsigned int lookup_flags = 0, target_flags = - LOOKUP_RENAME_TARGET | LOOKUP_CREATE; + struct delegated_inode delegated_inode = { }; + unsigned int lookup_flags = 0; bool should_retry = false; int error = -EINVAL; @@ -5274,11 +5984,6 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, (flags & RENAME_EXCHANGE)) goto put_names; - if (flags & RENAME_EXCHANGE) - target_flags = 0; - if (flags & RENAME_NOREPLACE) - target_flags |= LOOKUP_EXCL; - retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); @@ -5308,68 +6013,42 @@ retry: goto exit2; retry_deleg: - trap = lock_rename(new_path.dentry, old_path.dentry); - if (IS_ERR(trap)) { - error = PTR_ERR(trap); + rd.old_parent = old_path.dentry; + rd.mnt_idmap = mnt_idmap(old_path.mnt); + rd.new_parent = new_path.dentry; + rd.delegated_inode = &delegated_inode; + rd.flags = flags; + + error = __start_renaming(&rd, lookup_flags, &old_last, &new_last); + if (error) goto exit_lock_rename; - } - old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, - lookup_flags); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; - new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, - lookup_flags | target_flags); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; if (flags & RENAME_EXCHANGE) { - if (!d_is_dir(new_dentry)) { + if (!d_is_dir(rd.new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) - goto exit5; + goto exit_unlock; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!d_is_dir(old_dentry)) { + if (!d_is_dir(rd.old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) - goto exit5; + goto exit_unlock; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) - goto exit5; + goto exit_unlock; } - /* source should not be ancestor of target */ - error = -EINVAL; - if (old_dentry == trap) - goto exit5; - /* target should not be an ancestor of source */ - if (!(flags & RENAME_EXCHANGE)) - error = -ENOTEMPTY; - if (new_dentry == trap) - goto exit5; - error = security_path_rename(&old_path, old_dentry, - &new_path, new_dentry, flags); + error = security_path_rename(&old_path, rd.old_dentry, + &new_path, rd.new_dentry, flags); if (error) - goto exit5; + goto exit_unlock; - rd.old_parent = old_path.dentry; - rd.old_dentry = old_dentry; - rd.mnt_idmap = mnt_idmap(old_path.mnt); - rd.new_parent = new_path.dentry; - rd.new_dentry = new_dentry; - rd.delegated_inode = &delegated_inode; - rd.flags = flags; error = vfs_rename(&rd); -exit5: - dput(new_dentry); -exit4: - dput(old_dentry); -exit3: - unlock_rename(new_path.dentry, old_path.dentry); +exit_unlock: + end_renaming(&rd); exit_lock_rename: - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/fs/namespace.c b/fs/namespace.c index d82910f33dc4..c58674a20cad 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -132,16 +132,6 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) -{ - struct ns_common *ns; - - if (!node) - return NULL; - ns = rb_entry(node, struct ns_common, ns_tree_node); - return container_of(ns, struct mnt_namespace, ns); -} - static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ @@ -151,7 +141,8 @@ static void mnt_ns_release(struct mnt_namespace *ns) kfree(ns); } } -DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) +DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, + if (!IS_ERR(_T)) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { @@ -1345,26 +1336,12 @@ static void delayed_mntput(struct work_struct *unused) } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); -static void mntput_no_expire(struct mount *mnt) +static void noinline mntput_no_expire_slowpath(struct mount *mnt) { LIST_HEAD(list); int count; - rcu_read_lock(); - if (likely(READ_ONCE(mnt->mnt_ns))) { - /* - * Since we don't do lock_mount_hash() here, - * ->mnt_ns can change under us. However, if it's - * non-NULL, then there's a reference that won't - * be dropped until after an RCU delay done after - * turning ->mnt_ns NULL. So if we observe it - * non-NULL under rcu_read_lock(), the reference - * we are dropping is not the final one. - */ - mnt_add_count(mnt, -1); - rcu_read_unlock(); - return; - } + VFS_BUG_ON(mnt->mnt_ns); lock_mount_hash(); /* * make sure that if __legitimize_mnt() has not seen us grab @@ -1415,6 +1392,26 @@ static void mntput_no_expire(struct mount *mnt) cleanup_mnt(mnt); } +static void mntput_no_expire(struct mount *mnt) +{ + rcu_read_lock(); + if (likely(READ_ONCE(mnt->mnt_ns))) { + /* + * Since we don't do lock_mount_hash() here, + * ->mnt_ns can change under us. However, if it's + * non-NULL, then there's a reference that won't + * be dropped until after an RCU delay done after + * turning ->mnt_ns NULL. So if we observe it + * non-NULL under rcu_read_lock(), the reference + * we are dropping is not the final one. + */ + mnt_add_count(mnt, -1); + rcu_read_unlock(); + return; + } + mntput_no_expire_slowpath(mnt); +} + void mntput(struct vfsmount *mnt) { if (mnt) { @@ -3103,19 +3100,7 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { - int fd; - struct file *file __free(fput) = NULL; - - file = vfs_open_tree(dfd, filename, flags); - if (IS_ERR(file)) - return PTR_ERR(file); - - fd = get_unused_fd_flags(flags & O_CLOEXEC); - if (fd < 0) - return fd; - - fd_install(fd, no_free_ptr(file)); - return fd; + return FD_ADD(flags, vfs_open_tree(dfd, filename, flags)); } /* @@ -4093,8 +4078,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } - if (!anon) - ns_tree_gen_id(&new_ns->ns); + ns_tree_gen_id(new_ns); + + new_ns->is_anon = anon; refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; init_waitqueue_head(&new_ns->poll); @@ -4283,10 +4269,10 @@ static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, attr_flags) { + struct path new_path __free(path_put) = {}; struct mnt_namespace *ns; struct fs_context *fc; - struct file *file; - struct path newmount; + struct vfsmount *new_mnt; struct mount *mnt; unsigned int mnt_flags = 0; long ret; @@ -4324,35 +4310,36 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, fc = fd_file(f)->private_data; - ret = mutex_lock_interruptible(&fc->uapi_mutex); - if (ret < 0) + ACQUIRE(mutex_intr, uapi_mutex)(&fc->uapi_mutex); + ret = ACQUIRE_ERR(mutex_intr, &uapi_mutex); + if (ret) return ret; /* There must be a valid superblock or we can't mount it */ ret = -EINVAL; if (!fc->root) - goto err_unlock; + return ret; ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { errorfcp(fc, "VFS", "Mount too revealing"); - goto err_unlock; + return ret; } ret = -EBUSY; if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) - goto err_unlock; + return ret; if (fc->sb_flags & SB_MANDLOCK) warn_mandlock(); - newmount.mnt = vfs_create_mount(fc); - if (IS_ERR(newmount.mnt)) { - ret = PTR_ERR(newmount.mnt); - goto err_unlock; - } - newmount.dentry = dget(fc->root); - newmount.mnt->mnt_flags = mnt_flags; + new_mnt = vfs_create_mount(fc); + if (IS_ERR(new_mnt)) + return PTR_ERR(new_mnt); + new_mnt->mnt_flags = mnt_flags; + + new_path.dentry = dget(fc->root); + new_path.mnt = new_mnt; /* We've done the mount bit - now move the file context into more or * less the same state as if we'd done an fspick(). We don't want to @@ -4362,38 +4349,27 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, vfs_clean_context(fc); ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); - if (IS_ERR(ns)) { - ret = PTR_ERR(ns); - goto err_path; - } - mnt = real_mount(newmount.mnt); + if (IS_ERR(ns)) + return PTR_ERR(ns); + mnt = real_mount(new_path.mnt); ns->root = mnt; ns->nr_mounts = 1; mnt_add_to_ns(ns, mnt); - mntget(newmount.mnt); + mntget(new_path.mnt); - /* Attach to an apparent O_PATH fd with a note that we need to unmount - * it, not just simply put it. - */ - file = dentry_open(&newmount, O_PATH, fc->cred); - if (IS_ERR(file)) { - dissolve_on_fput(newmount.mnt); - ret = PTR_ERR(file); - goto err_path; + FD_PREPARE(fdf, (flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0, + dentry_open(&new_path, O_PATH, fc->cred)); + if (fdf.err) { + dissolve_on_fput(new_path.mnt); + return fdf.err; } - file->f_mode |= FMODE_NEED_UNMOUNT; - - ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0); - if (ret >= 0) - fd_install(ret, file); - else - fput(file); -err_path: - path_put(&newmount); -err_unlock: - mutex_unlock(&fc->uapi_mutex); - return ret; + /* + * Attach to an apparent O_PATH fd with a note that we + * need to unmount it, not just simply put it. + */ + fd_prepare_file(fdf)->f_mode |= FMODE_NEED_UNMOUNT; + return fd_publish(fdf); } static inline int vfs_move_mount(const struct path *from_path, @@ -5035,19 +5011,17 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, unsigned, flags, struct mount_attr __user *, uattr, size_t, usize) { - struct file __free(fput) *file = NULL; - int fd; - if (!uattr && usize) return -EINVAL; - file = vfs_open_tree(dfd, filename, flags); - if (IS_ERR(file)) - return PTR_ERR(file); + FD_PREPARE(fdf, flags, vfs_open_tree(dfd, filename, flags)); + if (fdf.err) + return fdf.err; if (uattr) { - int ret; struct mount_kattr kattr = {}; + struct file *file = fd_prepare_file(fdf); + int ret; if (flags & OPEN_TREE_CLONE) kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; @@ -5063,12 +5037,7 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, return ret; } - fd = get_unused_fd_flags(flags & O_CLOEXEC); - if (fd < 0) - return fd; - - fd_install(fd, no_free_ptr(file)); - return fd; + return fd_publish(fdf); } int show_path(struct seq_file *m, struct dentry *root) @@ -5150,6 +5119,12 @@ static u64 mnt_to_propagation_flags(struct mount *m) return propagation; } +u64 vfsmount_to_propagation_flags(struct vfsmount *mnt) +{ + return mnt_to_propagation_flags(real_mount(mnt)); +} +EXPORT_SYMBOL_GPL(vfsmount_to_propagation_flags); + static void statmount_sb_basic(struct kstatmount *s) { struct super_block *sb = s->mnt->mnt_sb; @@ -5454,11 +5429,11 @@ static int statmount_string(struct kstatmount *s, u64 flag) ret = statmount_sb_source(s, seq); break; case STATMOUNT_MNT_UIDMAP: - sm->mnt_uidmap = start; + offp = &sm->mnt_uidmap; ret = statmount_mnt_uidmap(s, seq); break; case STATMOUNT_MNT_GIDMAP: - sm->mnt_gidmap = start; + offp = &sm->mnt_gidmap; ret = statmount_mnt_gidmap(s, seq); break; default: @@ -5736,7 +5711,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; - if (kreq->spare != 0) + if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) return -EINVAL; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) @@ -5753,16 +5728,14 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq { struct mnt_namespace *mnt_ns; - if (kreq->mnt_ns_id && kreq->spare) - return ERR_PTR(-EINVAL); - - if (kreq->mnt_ns_id) - return lookup_mnt_ns(kreq->mnt_ns_id); - - if (kreq->spare) { + if (kreq->mnt_ns_id) { + mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id); + if (!mnt_ns) + return ERR_PTR(-ENOENT); + } else if (kreq->mnt_ns_fd) { struct ns_common *ns; - CLASS(fd, f)(kreq->spare); + CLASS(fd, f)(kreq->mnt_ns_fd); if (fd_empty(f)) return ERR_PTR(-EBADF); @@ -5774,11 +5747,12 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); + refcount_inc(&mnt_ns->passive); } else { mnt_ns = current->nsproxy->mnt_ns; + refcount_inc(&mnt_ns->passive); } - refcount_inc(&mnt_ns->passive); return mnt_ns; } @@ -5801,8 +5775,8 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, return ret; ns = grab_requested_mnt_ns(&kreq); - if (!ns) - return -ENOENT; + if (IS_ERR(ns)) + return PTR_ERR(ns); if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) @@ -5912,8 +5886,8 @@ static void __free_klistmount_free(const struct klistmount *kls) static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, size_t nr_mnt_ids) { - u64 last_mnt_id = kreq->param; + struct mnt_namespace *ns; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) @@ -5927,9 +5901,10 @@ static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req * if (!kls->kmnt_ids) return -ENOMEM; - kls->ns = grab_requested_mnt_ns(kreq); - if (!kls->ns) - return -ENOENT; + ns = grab_requested_mnt_ns(kreq); + if (IS_ERR(ns)) + return PTR_ERR(ns); + kls->ns = ns; kls->mnt_parent_id = kreq->mnt_id; return 0; @@ -5985,11 +5960,8 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, } struct mnt_namespace init_mnt_ns = { - .ns.inum = ns_init_inum(&init_mnt_ns), - .ns.ops = &mntns_operations, + .ns = NS_COMMON_INIT(init_mnt_ns), .user_ns = &init_user_ns, - .ns.__ns_ref = REFCOUNT_INIT(1), - .ns.ns_type = ns_common_type(&init_mnt_ns), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 09394ac2c180..f9d62abef2ac 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -535,7 +535,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr folio_unlock(folio); err = filemap_fdatawrite_range(mapping, folio_pos(folio), - folio_pos(folio) + folio_size(folio)); + folio_next_pos(folio)); switch (err) { case 0: ret = VM_FAULT_RETRY; diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 486166460e17..6df89c92b10b 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -147,10 +147,10 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) if (!fscache_cookie_valid(cookie)) return true; - if (!(inode->i_state & I_PINNING_NETFS_WB)) { + if (!(inode_state_read_once(inode) & I_PINNING_NETFS_WB)) { spin_lock(&inode->i_lock); - if (!(inode->i_state & I_PINNING_NETFS_WB)) { - inode->i_state |= I_PINNING_NETFS_WB; + if (!(inode_state_read(inode) & I_PINNING_NETFS_WB)) { + inode_state_set(inode, I_PINNING_NETFS_WB); need_use = true; } spin_unlock(&inode->i_lock); @@ -192,7 +192,7 @@ void netfs_clear_inode_writeback(struct inode *inode, const void *aux) { struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); - if (inode->i_state & I_PINNING_NETFS_WB) { + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) { loff_t i_size = i_size_read(inode); fscache_unuse_cookie(cookie, aux, &i_size); } @@ -298,7 +298,7 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_dirty(folio)) return false; - end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode)); + end = umin(folio_next_pos(folio), i_size_read(&ctx->inode)); if (end > ctx->zero_point) ctx->zero_point = end; diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index 5c0dc4efc792..8e6264f62a8f 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -36,12 +36,12 @@ void netfs_single_mark_inode_dirty(struct inode *inode) mark_inode_dirty(inode); - if (caching && !(inode->i_state & I_PINNING_NETFS_WB)) { + if (caching && !(inode_state_read_once(inode) & I_PINNING_NETFS_WB)) { bool need_use = false; spin_lock(&inode->i_lock); - if (!(inode->i_state & I_PINNING_NETFS_WB)) { - inode->i_state |= I_PINNING_NETFS_WB; + if (!(inode_state_read(inode) & I_PINNING_NETFS_WB)) { + inode_state_set(inode, I_PINNING_NETFS_WB); need_use = true; } spin_unlock(&inode->i_lock); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4e3dcc157a83..54699299d5b1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -338,6 +338,14 @@ again: /* Match the xprt security policy */ if (clp->cl_xprtsec.policy != data->xprtsec.policy) continue; + if (clp->cl_xprtsec.policy == RPC_XPRTSEC_TLS_X509) { + if (clp->cl_xprtsec.cert_serial != + data->xprtsec.cert_serial) + continue; + if (clp->cl_xprtsec.privkey_serial != + data->xprtsec.privkey_serial) + continue; + } refcount_inc(&clp->cl_count); return clp; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 46d9c65d50f8..ea9f6ca8f30f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2268,11 +2268,12 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, return -ENAMETOOLONG; if (open_flags & O_CREAT) { - file->f_mode |= FMODE_CREATED; error = nfs_do_create(dir, dentry, mode, open_flags); - if (error) + if (!error) { + file->f_mode |= FMODE_CREATED; + return finish_open(file, dentry, NULL); + } else if (error != -EEXIST || open_flags & O_EXCL) return error; - return finish_open(file, dentry, NULL); } if (d_in_lookup(dentry)) { /* The only flags nfs_lookup considers are diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 18b57c7c2f97..f76fe406937a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -475,7 +475,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) goto out_no_inode; } - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long now = jiffies; @@ -718,6 +718,8 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct nfs_fattr *fattr; loff_t oldsize = i_size_read(inode); int error = 0; + kuid_t task_uid = current_fsuid(); + kuid_t owner_uid = inode->i_uid; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -739,9 +741,11 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { spin_lock(&inode->i_lock); if (attr->ia_valid & ATTR_MTIME_SET) { - nfs_set_timestamps_to_ts(inode, attr); - attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| + if (uid_eq(task_uid, owner_uid)) { + nfs_set_timestamps_to_ts(inode, attr); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| ATTR_ATIME|ATTR_ATIME_SET); + } } else { nfs_update_timestamps(inode, attr->ia_valid); attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); @@ -751,10 +755,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { if (attr->ia_valid & ATTR_ATIME_SET) { - spin_lock(&inode->i_lock); - nfs_set_timestamps_to_ts(inode, attr); - spin_unlock(&inode->i_lock); - attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + if (uid_eq(task_uid, owner_uid)) { + spin_lock(&inode->i_lock); + nfs_set_timestamps_to_ts(inode, attr); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + } } else { nfs_update_delegated_atime(inode); attr->ia_valid &= ~ATTR_ATIME; diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 2c0455e91571..49ed90c6b9f2 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -42,10 +42,9 @@ struct nfs_local_kiocb { /* Begin mostly DIO-specific members */ size_t end_len; short int end_iter_index; - short int n_iters; + atomic_t n_iters; bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; - loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned; - struct iov_iter iters[NFSLOCAL_MAX_IOS]; + struct iov_iter iters[NFSLOCAL_MAX_IOS] ____cacheline_aligned; /* End mostly DIO-specific members */ }; @@ -314,7 +313,9 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, init_sync_kiocb(&iocb->kiocb, file); iocb->hdr = hdr; + iocb->kiocb.ki_pos = hdr->args.offset; iocb->kiocb.ki_flags &= ~IOCB_APPEND; + iocb->kiocb.ki_complete = NULL; iocb->aio_complete_work = NULL; iocb->end_iter_index = -1; @@ -388,13 +389,24 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, return true; } +static void +nfs_local_iter_setup(struct iov_iter *iter, int rw, struct bio_vec *bvec, + unsigned int nvecs, unsigned long total, + size_t start, size_t len) +{ + iov_iter_bvec(iter, rw, bvec, nvecs, total); + if (start) + iov_iter_advance(iter, start); + iov_iter_truncate(iter, len); +} + /* * Setup as many as 3 iov_iter based on extents described by @local_dio. * Returns the number of iov_iter that were setup. */ static int nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, - unsigned int nvecs, size_t len, + unsigned int nvecs, unsigned long total, struct nfs_local_dio *local_dio) { int n_iters = 0; @@ -402,39 +414,17 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, /* Setup misaligned start? */ if (local_dio->start_len) { - iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); - iters[n_iters].count = local_dio->start_len; - iocb->offset[n_iters] = iocb->hdr->args.offset; - iocb->iter_is_dio_aligned[n_iters] = false; + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, 0, local_dio->start_len); ++n_iters; } - /* Setup misaligned end? - * If so, the end is purposely setup to be issued using buffered IO - * before the middle (which will use DIO, if DIO-aligned, with AIO). - * This creates problems if/when the end results in a partial write. - * So must save index and length of end to handle this corner case. - */ - if (local_dio->end_len) { - iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); - iocb->offset[n_iters] = local_dio->end_offset; - iov_iter_advance(&iters[n_iters], - local_dio->start_len + local_dio->middle_len); - iocb->iter_is_dio_aligned[n_iters] = false; - /* Save index and length of end */ - iocb->end_iter_index = n_iters; - iocb->end_len = local_dio->end_len; - ++n_iters; - } - - /* Setup DIO-aligned middle to be issued last, to allow for - * DIO with AIO completion (see nfs_local_call_{read,write}). + /* + * Setup DIO-aligned middle, if there is no misaligned end (below) + * then AIO completion is used, see nfs_local_call_{read,write} */ - iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); - if (local_dio->start_len) - iov_iter_advance(&iters[n_iters], local_dio->start_len); - iters[n_iters].count -= local_dio->end_len; - iocb->offset[n_iters] = local_dio->middle_offset; + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, nvecs, + total, local_dio->start_len, local_dio->middle_len); iocb->iter_is_dio_aligned[n_iters] = nfs_iov_iter_aligned_bvec(&iters[n_iters], @@ -442,12 +432,22 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) { trace_nfs_local_dio_misaligned(iocb->hdr->inode, - iocb->hdr->args.offset, len, local_dio); + local_dio->start_len, local_dio->middle_len, local_dio); return 0; /* no DIO-aligned IO possible */ } + iocb->end_iter_index = n_iters; ++n_iters; - iocb->n_iters = n_iters; + /* Setup misaligned end? */ + if (local_dio->end_len) { + nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, + nvecs, total, local_dio->start_len + + local_dio->middle_len, local_dio->end_len); + iocb->end_iter_index = n_iters; + ++n_iters; + } + + atomic_set(&iocb->n_iters, n_iters); return n_iters; } @@ -473,18 +473,26 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) } len = hdr->args.count - total; + /* + * For each iocb, iocb->n_iters is always at least 1 and we always + * end io after first nfs_local_pgio_done call unless misaligned DIO. + */ + atomic_set(&iocb->n_iters, 1); + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { struct nfs_local_dio local_dio; if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && - nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) { + /* Ensure DIO WRITE's IO on stable storage upon completion */ + if (rw == ITER_SOURCE) + iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC; return; /* is DIO-aligned */ + } } /* Use buffered IO */ - iocb->offset[0] = hdr->args.offset; iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); - iocb->n_iters = 1; } static void @@ -504,9 +512,11 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr, hdr->task.tk_start = ktime_get(); } -static void -nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) +static bool +nfs_local_pgio_done(struct nfs_local_kiocb *iocb, long status, bool force) { + struct nfs_pgio_header *hdr = iocb->hdr; + /* Must handle partial completions */ if (status >= 0) { hdr->res.count += status; @@ -517,6 +527,12 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); hdr->task.tk_status = status; } + + if (force) + return true; + + BUG_ON(atomic_read(&iocb->n_iters) <= 0); + return atomic_dec_and_test(&iocb->n_iters); } static void @@ -547,11 +563,11 @@ static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb) queue_work(nfsiod_workqueue, &iocb->work); } -static void -nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) +static void nfs_local_read_done(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; struct file *filp = iocb->kiocb.ki_filp; + long status = hdr->task.tk_status; if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ @@ -564,20 +580,27 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) */ hdr->res.replen = 0; - if (hdr->res.count != hdr->args.count || - hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) + /* nfs_readpage_result() handles short read */ + + if (hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) hdr->res.eof = true; dprintk("%s: read %ld bytes eof %d.\n", __func__, status > 0 ? status : 0, hdr->res.eof); } +static inline void nfs_local_read_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_read_done(iocb); + nfs_local_pgio_release(iocb); +} + static void nfs_local_read_aio_complete_work(struct work_struct *work) { struct nfs_local_kiocb *iocb = container_of(work, struct nfs_local_kiocb, work); - nfs_local_pgio_release(iocb); + nfs_local_read_iocb_done(iocb); } static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) @@ -585,43 +608,51 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); - nfs_local_pgio_done(iocb->hdr, ret); - nfs_local_read_done(iocb, ret); + /* AIO completion of DIO read should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ } -static void nfs_local_call_read(struct work_struct *work) +static void do_nfs_local_call_read(struct nfs_local_kiocb *iocb, struct file *filp) { - struct nfs_local_kiocb *iocb = - container_of(work, struct nfs_local_kiocb, work); - struct file *filp = iocb->kiocb.ki_filp; - const struct cred *save_cred; + bool force_done = false; ssize_t status; + int n_iters; - save_cred = override_creds(filp->f_cred); - - for (int i = 0; i < iocb->n_iters ; i++) { + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { if (iocb->iter_is_dio_aligned[i]) { iocb->kiocb.ki_flags |= IOCB_DIRECT; - iocb->kiocb.ki_complete = nfs_local_read_aio_complete; - iocb->aio_complete_work = nfs_local_read_aio_complete_work; - } + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; - iocb->kiocb.ki_pos = iocb->offset[i]; status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); if (status != -EIOCBQUEUED) { - nfs_local_pgio_done(iocb->hdr, status); - if (iocb->hdr->task.tk_status) + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial read */ + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_read_iocb_done(iocb); break; + } } } +} - revert_creds(save_cred); +static void nfs_local_call_read(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + struct file *filp = iocb->kiocb.ki_filp; - if (status != -EIOCBQUEUED) { - nfs_local_read_done(iocb, status); - nfs_local_pgio_release(iocb); - } + scoped_with_creds(filp->f_cred) + do_nfs_local_call_read(iocb, filp); } static int @@ -736,11 +767,10 @@ static void nfs_local_vfs_getattr(struct nfs_local_kiocb *iocb) fattr->du.nfs3.used = stat.blocks << 9; } -static void -nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) +static void nfs_local_write_done(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - struct inode *inode = hdr->inode; + long status = hdr->task.tk_status; dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0); @@ -759,10 +789,17 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); status = -ENOSPC; /* record -ENOSPC in terms of nfs_local_pgio_done */ - nfs_local_pgio_done(hdr, status); + (void) nfs_local_pgio_done(iocb, status, true); } if (hdr->task.tk_status < 0) - nfs_reset_boot_verifier(inode); + nfs_reset_boot_verifier(hdr->inode); +} + +static inline void nfs_local_write_iocb_done(struct nfs_local_kiocb *iocb) +{ + nfs_local_write_done(iocb); + nfs_local_vfs_getattr(iocb); + nfs_local_pgio_release(iocb); } static void nfs_local_write_aio_complete_work(struct work_struct *work) @@ -770,8 +807,7 @@ static void nfs_local_write_aio_complete_work(struct work_struct *work) struct nfs_local_kiocb *iocb = container_of(work, struct nfs_local_kiocb, work); - nfs_local_vfs_getattr(iocb); - nfs_local_pgio_release(iocb); + nfs_local_write_iocb_done(iocb); } static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) @@ -779,75 +815,62 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); - nfs_local_pgio_done(iocb->hdr, ret); - nfs_local_write_done(iocb, ret); + /* AIO completion of DIO write should always be last to complete */ + if (unlikely(!nfs_local_pgio_done(iocb, ret, false))) + return; + nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ } -static void nfs_local_call_write(struct work_struct *work) +static ssize_t do_nfs_local_call_write(struct nfs_local_kiocb *iocb, + struct file *filp) { - struct nfs_local_kiocb *iocb = - container_of(work, struct nfs_local_kiocb, work); - struct file *filp = iocb->kiocb.ki_filp; - unsigned long old_flags = current->flags; - const struct cred *save_cred; + bool force_done = false; ssize_t status; - - current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; - save_cred = override_creds(filp->f_cred); + int n_iters; file_start_write(filp); - for (int i = 0; i < iocb->n_iters ; i++) { + n_iters = atomic_read(&iocb->n_iters); + for (int i = 0; i < n_iters ; i++) { if (iocb->iter_is_dio_aligned[i]) { iocb->kiocb.ki_flags |= IOCB_DIRECT; - iocb->kiocb.ki_complete = nfs_local_write_aio_complete; - iocb->aio_complete_work = nfs_local_write_aio_complete_work; - } -retry: - iocb->kiocb.ki_pos = iocb->offset[i]; + /* Only use AIO completion if DIO-aligned segment is last */ + if (i == iocb->end_iter_index) { + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } + } else + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); if (status != -EIOCBQUEUED) { - if (unlikely(status >= 0 && status < iocb->iters[i].count)) { - /* partial write */ - if (i == iocb->end_iter_index) { - /* Must not account partial end, otherwise, due - * to end being issued before middle: the partial - * write accounting in nfs_local_write_done() - * would incorrectly advance hdr->args.offset - */ - status = 0; - } else { - /* Partial write at start or buffered middle, - * exit early. - */ - nfs_local_pgio_done(iocb->hdr, status); - break; - } - } else if (unlikely(status == -ENOTBLK && - (iocb->kiocb.ki_flags & IOCB_DIRECT))) { - /* VFS will return -ENOTBLK if DIO WRITE fails to - * invalidate the page cache. Retry using buffered IO. - */ - iocb->kiocb.ki_flags &= ~IOCB_DIRECT; - iocb->kiocb.ki_complete = NULL; - iocb->aio_complete_work = NULL; - goto retry; - } - nfs_local_pgio_done(iocb->hdr, status); - if (iocb->hdr->task.tk_status) + if (unlikely(status >= 0 && status < iocb->iters[i].count)) + force_done = true; /* Partial write */ + if (nfs_local_pgio_done(iocb, status, force_done)) { + nfs_local_write_iocb_done(iocb); break; + } } } file_end_write(filp); - revert_creds(save_cred); - current->flags = old_flags; + return status; +} - if (status != -EIOCBQUEUED) { - nfs_local_write_done(iocb, status); - nfs_local_vfs_getattr(iocb); - nfs_local_pgio_release(iocb); - } +static void nfs_local_call_write(struct work_struct *work) +{ + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + struct file *filp = iocb->kiocb.ki_filp; + unsigned long old_flags = current->flags; + ssize_t status; + + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; + + scoped_with_creds(filp->f_cred) + status = do_nfs_local_call_write(iocb, filp); + + current->flags = old_flags; } static int diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 0d7310c1ee0c..5d97c1d38bb6 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -2,6 +2,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/sunrpc/addr.h> +#include <net/handshake.h> #include "internal.h" #include "nfs3_fs.h" #include "netns.h" @@ -98,7 +99,11 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, - .xprtsec = mds_clp->cl_xprtsec, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, .connect_timeout = connect_timeout, .reconnect_timeout = connect_timeout, }; @@ -111,9 +116,14 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { + case XPRT_TRANSPORT_TCP_TLS: + if (mds_clp->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_clp->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: - case XPRT_TRANSPORT_TCP_TLS: if (mds_clp->cl_nconnect > 1) cl_init.nconnect = mds_clp->cl_nconnect; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 5998d6bd8a4f..3a4baed993c9 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -11,6 +11,7 @@ #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <net/handshake.h> #include "internal.h" #include "callback.h" #include "delegation.h" @@ -983,7 +984,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, - .xprtsec = mds_srv->nfs_client->cl_xprtsec, + .xprtsec = { + .policy = RPC_XPRTSEC_NONE, + .cert_serial = TLS_NO_CERT, + .privkey_serial = TLS_NO_PRIVKEY, + }, }; char buf[INET6_ADDRSTRLEN + 1]; @@ -992,9 +997,14 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, cl_init.hostname = buf; switch (ds_proto) { + case XPRT_TRANSPORT_TCP_TLS: + if (mds_srv->nfs_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + cl_init.xprtsec = mds_srv->nfs_client->cl_xprtsec; + else + ds_proto = XPRT_TRANSPORT_TCP; + fallthrough; case XPRT_TRANSPORT_RDMA: case XPRT_TRANSPORT_TCP: - case XPRT_TRANSPORT_TCP_TLS: if (mds_clp->cl_nconnect > 1) { cl_init.nconnect = mds_clp->cl_nconnect; cl_init.max_connect = NFS_MAX_TRANSPORTS; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 7f43e890d356..7317f26892c5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -431,6 +431,8 @@ void nfs42_ssc_unregister_ops(void) static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease, void **priv) { + if (!S_ISREG(file_inode(file)->i_mode)) + return -EINVAL; return nfs4_proc_setlease(file, arg, lease, priv); } diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 00932500fce4..9e1c48c5c0b8 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -306,15 +306,12 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const char *type, void *data, size_t data_size, struct idmap *idmap) { - const struct cred *saved_cred; struct key *rkey; const struct user_key_payload *payload; ssize_t ret; - saved_cred = override_creds(id_resolver_cache); - rkey = nfs_idmap_request_key(name, namelen, type, idmap); - revert_creds(saved_cred); - + scoped_with_creds(id_resolver_cache) + rkey = nfs_idmap_request_key(name, namelen, type, idmap); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 411776718494..93c6ce04332b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4715,16 +4715,19 @@ static int _nfs4_proc_lookupp(struct inode *inode, }; unsigned short task_flags = 0; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) + if (server->flags & NFS_MOUNT_SOFTREVAL) task_flags |= RPC_TASK_TIMEOUT; + if (server->caps & NFS_CAP_MOVEABLE) + task_flags |= RPC_TASK_MOVEABLE; args.bitmask = nfs4_bitmask(server, fattr->label); nfs_fattr_init(fattr); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, task_flags); + status = nfs4_do_call_sync(clnt, server, &msg, &args.seq_args, + &res.seq_res, task_flags); dprintk("NFS reply lookupp: %d\n", status); return status; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a3135b5af7ee..f157d43d1312 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -317,7 +317,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); /* Notify pnfs_destroy_layout_final() that we're done */ - if (inode->i_state & (I_FREEING | I_CLEAR)) + if (inode_state_read(inode) & (I_FREEING | I_CLEAR)) wake_up_var_locked(lo, &inode->i_lock); spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7b32afb29782..9976cc16b689 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -809,8 +809,11 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, unsigned int retrans) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -834,27 +837,28 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .xprtsec = clp->cl_xprtsec, }; - if (da->da_transport != clp->cl_proto && - clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) - continue; - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; - if (da->da_addr.ss_family != clp->cl_addr.ss_family) + if (xprt_args.ident != clp->cl_proto) + continue; + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) continue; /* Add this address as an alias */ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, NULL); continue; } - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) - da->da_transport = XPRT_TRANSPORT_TCP_TLS; - clp = get_v3_ds_connect(mds_srv, - &da->da_addr, - da->da_addrlen, da->da_transport, - timeo, retrans); + + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen, + ds_proto, timeo, retrans); if (IS_ERR(clp)) continue; clp->cl_rpcclient->cl_softerr = 0; @@ -880,7 +884,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, u32 minor_version) { struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs_client *mds_clp = mds_srv->nfs_client; + enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy; struct nfs4_pnfs_ds_addr *da; + int ds_proto; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -908,12 +915,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, .data = &xprtdata, }; - if (da->da_transport != clp->cl_proto && - clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) - continue; - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == - XPRT_TRANSPORT_TCP_TLS) { + if (xprt_args.ident == XPRT_TRANSPORT_TCP && + clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) { struct sockaddr *addr = (struct sockaddr *)&da->da_addr; struct sockaddr_in *sin = @@ -944,7 +947,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; xprt_args.servername = servername; } - if (da->da_addr.ss_family != clp->cl_addr.ss_family) + if (xprt_args.ident != clp->cl_proto) + continue; + if (xprt_args.dstaddr->sa_family != + clp->cl_addr.ss_family) continue; /** @@ -958,15 +964,14 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, if (xprtdata.cred) put_cred(xprtdata.cred); } else { - if (da->da_transport == XPRT_TRANSPORT_TCP && - mds_srv->nfs_client->cl_proto == - XPRT_TRANSPORT_TCP_TLS) - da->da_transport = XPRT_TRANSPORT_TCP_TLS; - clp = nfs4_set_ds_client(mds_srv, - &da->da_addr, - da->da_addrlen, - da->da_transport, timeo, - retrans, minor_version); + ds_proto = da->da_transport; + if (ds_proto == XPRT_TRANSPORT_TCP && + xprtsec_policy != RPC_XPRTSEC_NONE) + ds_proto = XPRT_TRANSPORT_TCP_TLS; + + clp = nfs4_set_ds_client(mds_srv, &da->da_addr, + da->da_addrlen, ds_proto, + timeo, retrans, minor_version); if (IS_ERR(clp)) continue; @@ -977,7 +982,6 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, clp = ERR_PTR(-EIO); continue; } - } } diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 545148d42dcc..ea6e6168092b 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -189,6 +189,7 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent, return p; kobject_put(&p->kobject); + kobject_put(&p->nfs_net_kobj); } return NULL; } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index a238b6725008..93798575b807 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1086,7 +1086,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net, struct auth_domain *client, struct svc_fh *fhp, unsigned int may_flags, struct file *file, - struct nfsd_file **pnf, bool want_gc) + umode_t type, bool want_gc, struct nfsd_file **pnf) { unsigned char need = may_flags & NFSD_FILE_MAY_MASK; struct nfsd_file *new, *nf; @@ -1097,13 +1097,13 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net, int ret; retry: - if (rqstp) { - status = fh_verify(rqstp, fhp, S_IFREG, + if (rqstp) + status = fh_verify(rqstp, fhp, type, may_flags|NFSD_MAY_OWNER_OVERRIDE); - } else { - status = fh_verify_local(net, cred, client, fhp, S_IFREG, + else + status = fh_verify_local(net, cred, client, fhp, type, may_flags|NFSD_MAY_OWNER_OVERRIDE); - } + if (status != nfs_ok) return status; inode = d_inode(fhp->fh_dentry); @@ -1176,15 +1176,18 @@ out: open_file: trace_nfsd_file_alloc(nf); - nf->nf_mark = nfsd_file_mark_find_or_create(inode); - if (nf->nf_mark) { + + if (type == S_IFREG) + nf->nf_mark = nfsd_file_mark_find_or_create(inode); + + if (type != S_IFREG || nf->nf_mark) { if (file) { get_file(file); nf->nf_file = file; status = nfs_ok; trace_nfsd_file_opened(nf, status); } else { - ret = nfsd_open_verified(fhp, may_flags, &nf->nf_file); + ret = nfsd_open_verified(fhp, type, may_flags, &nf->nf_file); if (ret == -EOPENSTALE && stale_retry) { stale_retry = false; nfsd_file_unhash(nf); @@ -1246,7 +1249,7 @@ nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, - fhp, may_flags, NULL, pnf, true); + fhp, may_flags, NULL, S_IFREG, true, pnf); } /** @@ -1271,7 +1274,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, - fhp, may_flags, NULL, pnf, false); + fhp, may_flags, NULL, S_IFREG, false, pnf); } /** @@ -1314,8 +1317,8 @@ nfsd_file_acquire_local(struct net *net, struct svc_cred *cred, const struct cred *save_cred = get_current_cred(); __be32 beres; - beres = nfsd_file_do_acquire(NULL, net, cred, client, - fhp, may_flags, NULL, pnf, false); + beres = nfsd_file_do_acquire(NULL, net, cred, client, fhp, may_flags, + NULL, S_IFREG, false, pnf); put_cred(revert_creds(save_cred)); return beres; } @@ -1344,7 +1347,33 @@ nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file **pnf) { return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, - fhp, may_flags, file, pnf, false); + fhp, may_flags, file, S_IFREG, false, pnf); +} + +/** + * nfsd_file_acquire_dir - Get a struct nfsd_file with an open directory + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @pnf: OUT: new or found "struct nfsd_file" object + * + * The nfsd_file_object returned by this API is reference-counted + * but not garbage-collected. The object is unhashed after the + * final nfsd_file_put(). This opens directories only, and only + * in O_RDONLY mode. + * + * Return values: + * %nfs_ok - @pnf points to an nfsd_file with its reference + * count boosted. + * + * On error, an nfsstat value in network byte order is returned. + */ +__be32 +nfsd_file_acquire_dir(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file **pnf) +{ + return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, fhp, + NFSD_MAY_READ|NFSD_MAY_64BIT_COOKIE, + NULL, S_IFDIR, false, pnf); } /* diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index e3d6ca2b6030..b383dbc5b921 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -82,5 +82,7 @@ __be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd_file_acquire_local(struct net *net, struct svc_cred *cred, struct auth_domain *client, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf); +__be32 nfsd_file_acquire_dir(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file **pnf); int nfsd_file_cache_stats_show(struct seq_file *m, void *v); #endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index b6d03e1ef5f7..42adc5461db0 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -281,14 +281,11 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err) return nfserrno(host_err); - inode_lock_nested(inode, I_MUTEX_PARENT); - - child = lookup_one(&nop_mnt_idmap, - &QSTR_LEN(argp->name, argp->len), - parent); + child = start_creating(&nop_mnt_idmap, parent, + &QSTR_LEN(argp->name, argp->len)); if (IS_ERR(child)) { status = nfserrno(PTR_ERR(child)); - goto out; + goto out_write; } if (d_really_is_negative(child)) { @@ -344,7 +341,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, status = fh_fill_pre_attrs(fhp); if (status != nfs_ok) goto out; - host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, child, iap->ia_mode, NULL); if (host_err < 0) { status = nfserrno(host_err); goto out; @@ -367,9 +364,8 @@ set_attr: status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs); out: - inode_unlock(inode); - if (child && !IS_ERR(child)) - dput(child); + end_creating(child); +out_write: fh_drop_write(fhp); return status; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 7f7e6bb23a90..b74800917583 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -264,14 +264,11 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (is_create_with_attrs(open)) nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs); - inode_lock_nested(inode, I_MUTEX_PARENT); - - child = lookup_one(&nop_mnt_idmap, - &QSTR_LEN(open->op_fname, open->op_fnamelen), - parent); + child = start_creating(&nop_mnt_idmap, parent, + &QSTR_LEN(open->op_fname, open->op_fnamelen)); if (IS_ERR(child)) { status = nfserrno(PTR_ERR(child)); - goto out; + goto out_write; } if (d_really_is_negative(child)) { @@ -379,10 +376,9 @@ set_attr: if (attrs.na_aclerr) open->op_bmval[0] &= ~FATTR4_WORD0_ACL; out: - inode_unlock(inode); + end_creating(child); nfsd_attrs_free(&attrs); - if (child && !IS_ERR(child)) - dput(child); +out_write: fh_drop_write(fhp); return status; } @@ -2342,6 +2338,13 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp, union nfsd4_op_u *u) { struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation; + struct nfs4_delegation *dd; + struct nfsd_file *nf; + __be32 status; + + status = nfsd_file_acquire_dir(rqstp, &cstate->current_fh, &nf); + if (status != nfs_ok) + return status; /* * RFC 8881, section 18.39.3 says: @@ -2355,7 +2358,20 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp, * return NFS4_OK with a non-fatal status of GDD4_UNAVAIL in this * situation. */ - gdd->gddrnf_status = GDD4_UNAVAIL; + dd = nfsd_get_dir_deleg(cstate, gdd, nf); + nfsd_file_put(nf); + if (IS_ERR(dd)) { + int err = PTR_ERR(dd); + + if (err != -EAGAIN) + return nfserrno(err); + gdd->gddrnf_status = GDD4_UNAVAIL; + return nfs_ok; + } + + gdd->gddrnf_status = GDD4_OK; + memcpy(&gdd->gddr_stateid, &dd->dl_stid.sc_stateid, sizeof(gdd->gddr_stateid)); + nfs4_put_stid(&dd->dl_stid); return nfs_ok; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e2b9472e5c78..b39d4cbdfd35 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -195,13 +195,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) goto out_creds; dir = nn->rec_file->f_path.dentry; - /* lock the parent */ - inode_lock(d_inode(dir)); - dentry = lookup_one(&nop_mnt_idmap, &QSTR(dname), dir); + dentry = start_creating(&nop_mnt_idmap, dir, &QSTR(dname)); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); - goto out_unlock; + goto out; } if (d_really_is_positive(dentry)) /* @@ -212,15 +210,13 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * In the 4.0 case, we should never get here; but we may * as well be forgiving and just succeed silently. */ - goto out_put; - dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); + goto out_end; + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, 0700, NULL); if (IS_ERR(dentry)) status = PTR_ERR(dentry); -out_put: - if (!status) - dput(dentry); -out_unlock: - inode_unlock(d_inode(dir)); +out_end: + end_creating(dentry); +out: if (status == 0) { if (nn->in_grace) __nfsd4_create_reclaim_record_grace(clp, dname, @@ -328,20 +324,12 @@ nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. name %s\n", name); dir = nn->rec_file->f_path.dentry; - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - dentry = lookup_one(&nop_mnt_idmap, &QSTR(name), dir); - if (IS_ERR(dentry)) { - status = PTR_ERR(dentry); - goto out_unlock; - } - status = -ENOENT; - if (d_really_is_negative(dentry)) - goto out; - status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry); -out: - dput(dentry); -out_unlock: - inode_unlock(d_inode(dir)); + dentry = start_removing(&nop_mnt_idmap, dir, &QSTR(name)); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry, NULL); + end_removing(dentry); return status; } @@ -427,7 +415,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(name, nn)) goto out_free; - status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child); + status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child, NULL); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c1b54322c412..6791fc239dbd 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1542,7 +1542,8 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid) release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); - WARN_ON(!list_empty(&stid->sc_cp_list)); + if (!list_empty(&stid->sc_cp_list)) + nfs4_free_cpntf_statelist(stid->sc_client->net, stid); kmem_cache_free(stateid_slab, stid); } @@ -3486,7 +3487,20 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) struct nfsd4_slot *slot = resp->cstate.slot; unsigned int base; - dprintk("--> %s slot %p\n", __func__, slot); + /* + * RFC 5661 Section 2.10.6.1.2: + * + * Any time SEQUENCE ... returns an error ... [t]he replier MUST NOT + * modify the reply cache entry for the slot whenever an error is + * returned from SEQUENCE ... + * + * Because nfsd4_store_cache_entry is called only by + * nfsd4_sequence_done(), nfsd4_store_cache_entry() is called only + * when a SEQUENCE operation was part of the COMPOUND. + * nfs41_check_op_ordering() ensures SEQUENCE is the first op. + */ + if (resp->opcnt == 1 && resp->cstate.status != nfs_ok) + return; slot->sl_flags |= NFSD4_SLOT_INITIALIZED; slot->sl_opcnt = resp->opcnt; @@ -4349,6 +4363,36 @@ static bool replay_matches_cache(struct svc_rqst *rqstp, return true; } +/* + * Note that the response is constructed here both for the case + * of a new SEQUENCE request and for a replayed SEQUENCE request. + * We do not cache SEQUENCE responses as SEQUENCE is idempotent. + */ +static void nfsd4_construct_sequence_response(struct nfsd4_session *session, + struct nfsd4_sequence *seq) +{ + struct nfs4_client *clp = session->se_client; + + seq->maxslots_response = max(session->se_target_maxslots, + seq->maxslots); + seq->target_maxslots = session->se_target_maxslots; + + switch (clp->cl_cb_state) { + case NFSD4_CB_DOWN: + seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; + break; + case NFSD4_CB_FAULT: + seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; + break; + default: + seq->status_flags = 0; + } + if (!list_empty(&clp->cl_revoked)) + seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + if (atomic_read(&clp->cl_admin_revoked)) + seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -4398,6 +4442,9 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("%s: slotid %d\n", __func__, seq->slotid); trace_nfsd_slot_seqid_sequence(clp, seq, slot); + + nfsd4_construct_sequence_response(session, seq); + status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags); if (status == nfserr_replay_cache) { status = nfserr_seq_misordered; @@ -4495,23 +4542,6 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } out: - seq->maxslots = max(session->se_target_maxslots, seq->maxslots); - seq->target_maxslots = session->se_target_maxslots; - - switch (clp->cl_cb_state) { - case NFSD4_CB_DOWN: - seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; - break; - case NFSD4_CB_FAULT: - seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT; - break; - default: - seq->status_flags = 0; - } - if (!list_empty(&clp->cl_revoked)) - seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; - if (atomic_read(&clp->cl_admin_revoked)) - seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) @@ -7829,7 +7859,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) + status = fh_verify(rqstp, &cstate->current_fh, 0, 0); + if (status) return status; status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED, &s, nn); @@ -9347,3 +9378,103 @@ out_status: nfs4_put_stid(&dp->dl_stid); return status; } + +/** + * nfsd_get_dir_deleg - attempt to get a directory delegation + * @cstate: compound state + * @gdd: GET_DIR_DELEGATION arg/resp structure + * @nf: nfsd_file opened on the directory + * + * Given a GET_DIR_DELEGATION request @gdd, attempt to acquire a delegation + * on the directory to which @nf refers. Note that this does not set up any + * sort of async notifications for the delegation. + */ +struct nfs4_delegation * +nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, + struct nfsd4_get_dir_delegation *gdd, + struct nfsd_file *nf) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_delegation *dp; + struct file_lease *fl; + struct nfs4_file *fp, *rfp; + int status = 0; + + fp = nfsd4_alloc_file(); + if (!fp) + return ERR_PTR(-ENOMEM); + + nfsd4_file_init(&cstate->current_fh, fp); + + rfp = nfsd4_file_hash_insert(fp, &cstate->current_fh); + if (unlikely(!rfp)) { + put_nfs4_file(fp); + return ERR_PTR(-ENOMEM); + } + + if (rfp != fp) { + put_nfs4_file(fp); + fp = rfp; + } + + /* if this client already has one, return that it's unavailable */ + spin_lock(&state_lock); + spin_lock(&fp->fi_lock); + /* existing delegation? */ + if (nfs4_delegation_exists(clp, fp)) { + status = -EAGAIN; + } else if (!fp->fi_deleg_file) { + fp->fi_deleg_file = nfsd_file_get(nf); + fp->fi_delegees = 1; + } else { + ++fp->fi_delegees; + } + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); + + if (status) { + put_nfs4_file(fp); + return ERR_PTR(status); + } + + /* Try to set up the lease */ + status = -ENOMEM; + dp = alloc_init_deleg(clp, fp, NULL, NFS4_OPEN_DELEGATE_READ); + if (!dp) + goto out_delegees; + + fl = nfs4_alloc_init_lease(dp); + if (!fl) + goto out_put_stid; + + status = kernel_setlease(nf->nf_file, + fl->c.flc_type, &fl, NULL); + if (fl) + locks_free_lease(fl); + if (status) + goto out_put_stid; + + /* + * Now, try to hash it. This can fail if we race another nfsd task + * trying to set a delegation on the same file. If that happens, + * then just say UNAVAIL. + */ + spin_lock(&state_lock); + spin_lock(&clp->cl_lock); + spin_lock(&fp->fi_lock); + status = hash_delegation_locked(dp, fp); + spin_unlock(&fp->fi_lock); + spin_unlock(&clp->cl_lock); + spin_unlock(&state_lock); + + if (!status) + return dp; + + /* Something failed. Drop the lease and clean up the stid */ + kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); +out_put_stid: + nfs4_put_stid(&dp->dl_stid); +out_delegees: + put_deleg_file(fp); + return ERR_PTR(status); +} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 6040a6145dad..67bb9c0b9fcb 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5073,7 +5073,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr; /* Note slotid's are numbered from zero: */ /* sr_highest_slotid */ - nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1); + nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots_response - 1); if (nfserr != nfs_ok) return nfserr; /* sr_target_highest_slotid */ @@ -5925,8 +5925,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) */ warn_on_nonidempotent_op(op); xdr_truncate_encode(xdr, op_status_offset + XDR_UNIT); - } - if (so) { + } else if (so) { int len = xdr->buf->len - (op_status_offset + XDR_UNIT); so->so_replay.rp_status = op->status; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index f19320018639..b752433c3c2c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -458,6 +458,7 @@ enum { #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ FATTR4_WORD2_MODE_UMASK | \ + FATTR4_WORD2_CLONE_BLKSIZE | \ NFSD4_2_SECURITY_ATTRS | \ FATTR4_WORD2_XATTR_SUPPORT | \ FATTR4_WORD2_TIME_DELEG_ACCESS | \ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 3eb724ec9566..ed85dd43da18 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -269,9 +269,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, dentry); } - fhp->fh_dentry = dentry; - fhp->fh_export = exp; - switch (fhp->fh_maxsize) { case NFS4_FHSIZE: if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR) @@ -293,6 +290,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net, goto out; } + fhp->fh_dentry = dentry; + fhp->fh_export = exp; + return 0; out: exp_put(exp); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 8f71f5748c75..481e789a7697 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -306,18 +306,16 @@ nfsd_proc_create(struct svc_rqst *rqstp) goto done; } - inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT); - dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(argp->name, argp->len), - dirfhp->fh_dentry); + dchild = start_creating(&nop_mnt_idmap, dirfhp->fh_dentry, + &QSTR_LEN(argp->name, argp->len)); if (IS_ERR(dchild)) { resp->status = nfserrno(PTR_ERR(dchild)); - goto out_unlock; + goto out_write; } fh_init(newfhp, NFS_FHSIZE); resp->status = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); if (!resp->status && d_really_is_negative(dchild)) resp->status = nfserr_noent; - dput(dchild); if (resp->status) { if (resp->status != nfserr_noent) goto out_unlock; @@ -409,6 +407,9 @@ nfsd_proc_create(struct svc_rqst *rqstp) /* File doesn't exist. Create it and set attrs */ resp->status = nfsd_create_locked(rqstp, dirfhp, &attrs, type, rdev, newfhp); + /* nfsd_create_locked() unlocked the parent */ + dput(dchild); + goto out_write; } else if (type == S_IFREG) { dprintk("nfsd: existing %s, valid=%x, size=%ld\n", argp->name, attr->ia_valid, (long) attr->ia_size); @@ -423,7 +424,8 @@ nfsd_proc_create(struct svc_rqst *rqstp) } out_unlock: - inode_unlock(dirfhp->fh_dentry->d_inode); + end_creating(dchild); +out_write: fh_drop_write(dirfhp); done: fh_put(dirfhp); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 1e736f402426..b052c1effdc5 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -867,4 +867,9 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_delegation **pdp); + +struct nfsd4_get_dir_delegation; +struct nfs4_delegation *nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, + struct nfsd4_get_dir_delegation *gdd, + struct nfsd_file *nf); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9cb20d4aeab1..31cbf46b47b1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -959,15 +959,16 @@ retry: /** * nfsd_open_verified - Open a regular file for the filecache * @fhp: NFS filehandle of the file to open + * @type: S_IFMT inode type allowed (0 means any type is allowed) * @may_flags: internal permission flags * @filp: OUT: open "struct file *" * * Returns zero on success, or a negative errno value. */ int -nfsd_open_verified(struct svc_fh *fhp, int may_flags, struct file **filp) +nfsd_open_verified(struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { - return __nfsd_open(fhp, S_IFREG, may_flags, filp); + return __nfsd_open(fhp, type, may_flags, filp); } /* @@ -1159,7 +1160,7 @@ static int wait_for_concurrent_writes(struct file *file) dprintk("nfsd: write resume %d\n", task_pid_nr(current)); } - if (inode->i_state & I_DIRTY) { + if (inode_state_read_once(inode) & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); err = vfs_fsync(file, 0); } @@ -1521,7 +1522,7 @@ nfsd_check_ignore_resizing(struct iattr *iap) iap->ia_valid &= ~ATTR_SIZE; } -/* The parent directory should already be locked: */ +/* The parent directory should already be locked - we will unlock */ __be32 nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_attrs *attrs, @@ -1552,13 +1553,12 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(&nop_mnt_idmap, dirp, dchild, - iap->ia_mode, true); + host_err = vfs_create(&nop_mnt_idmap, dchild, iap->ia_mode, NULL); if (!host_err) nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); + dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode, NULL); if (IS_ERR(dchild)) { host_err = PTR_ERR(dchild); } else if (d_is_negative(dchild)) { @@ -1574,7 +1574,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFIFO: case S_IFSOCK: host_err = vfs_mknod(&nop_mnt_idmap, dirp, dchild, - iap->ia_mode, rdev); + iap->ia_mode, rdev, NULL); break; default: printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", @@ -1587,8 +1587,9 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs); out: - if (!IS_ERR(dchild)) - dput(dchild); + if (!err) + fh_fill_post_attrs(fhp); + end_creating(dchild); return err; out_nfserr: @@ -1626,28 +1627,24 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err) return nfserrno(host_err); - inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT); - dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); + dchild = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); host_err = PTR_ERR(dchild); - if (IS_ERR(dchild)) { - err = nfserrno(host_err); - goto out_unlock; - } + if (IS_ERR(dchild)) + return nfserrno(host_err); + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - /* - * We unconditionally drop our ref to dchild as fh_compose will have - * already grabbed its own ref for it. - */ - dput(dchild); if (err) goto out_unlock; err = fh_fill_pre_attrs(fhp); if (err != nfs_ok) goto out_unlock; err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp); - fh_fill_post_attrs(fhp); + /* nfsd_create_locked() unlocked the parent */ + dput(dchild); + return err; + out_unlock: - inode_unlock(dentry->d_inode); + end_creating(dchild); return err; } @@ -1733,28 +1730,26 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, } dentry = fhp->fh_dentry; - inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT); - dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); + dnew = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); if (IS_ERR(dnew)) { err = nfserrno(PTR_ERR(dnew)); - inode_unlock(dentry->d_inode); goto out_drop_write; } err = fh_fill_pre_attrs(fhp); if (err != nfs_ok) goto out_unlock; - host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path); + host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path, NULL); err = nfserrno(host_err); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); if (!err) nfsd_create_setattr(rqstp, fhp, resfhp, attrs); fh_fill_post_attrs(fhp); out_unlock: - inode_unlock(dentry->d_inode); + end_creating(dnew); if (!err) err = nfserrno(commit_metadata(fhp)); - dput(dnew); - if (err==0) err = cerr; + if (!err) + err = cerr; out_drop_write: fh_drop_write(fhp); out: @@ -1809,32 +1804,31 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, ddir = ffhp->fh_dentry; dirp = d_inode(ddir); - inode_lock_nested(dirp, I_MUTEX_PARENT); + dnew = start_creating(&nop_mnt_idmap, ddir, &QSTR_LEN(name, len)); - dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(name, len), ddir); if (IS_ERR(dnew)) { host_err = PTR_ERR(dnew); - goto out_unlock; + goto out_drop_write; } dold = tfhp->fh_dentry; err = nfserr_noent; if (d_really_is_negative(dold)) - goto out_dput; + goto out_unlock; err = fh_fill_pre_attrs(ffhp); if (err != nfs_ok) - goto out_dput; + goto out_unlock; host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); - inode_unlock(dirp); +out_unlock: + end_creating(dnew); if (!host_err) { host_err = commit_metadata(ffhp); if (!host_err) host_err = commit_metadata(tfhp); } - dput(dnew); out_drop_write: fh_drop_write(tfhp); if (host_err == -EBUSY) { @@ -1849,12 +1843,6 @@ out_drop_write: } out: return err != nfs_ok ? err : nfserrno(host_err); - -out_dput: - dput(dnew); -out_unlock: - inode_unlock(dirp); - goto out_drop_write; } static void @@ -1895,11 +1883,12 @@ __be32 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct svc_fh *tfhp, char *tname, int tlen) { - struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; + struct dentry *fdentry, *tdentry; int type = S_IFDIR; + struct renamedata rd = {}; __be32 err; int host_err; - bool close_cached = false; + struct dentry *close_cached; trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen); @@ -1925,15 +1914,22 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out; retry: + close_cached = NULL; host_err = fh_want_write(ffhp); if (host_err) { err = nfserrno(host_err); goto out; } - trap = lock_rename(tdentry, fdentry); - if (IS_ERR(trap)) { - err = nfserr_xdev; + rd.mnt_idmap = &nop_mnt_idmap; + rd.old_parent = fdentry; + rd.new_parent = tdentry; + + host_err = start_renaming(&rd, 0, &QSTR_LEN(fname, flen), + &QSTR_LEN(tname, tlen)); + + if (host_err) { + err = nfserrno(host_err); goto out_want_write; } err = fh_fill_pre_attrs(ffhp); @@ -1943,48 +1939,23 @@ retry: if (err != nfs_ok) goto out_unlock; - odentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), fdentry); - host_err = PTR_ERR(odentry); - if (IS_ERR(odentry)) - goto out_nfserr; + type = d_inode(rd.old_dentry)->i_mode & S_IFMT; + + if (d_inode(rd.new_dentry)) + type = d_inode(rd.new_dentry)->i_mode & S_IFMT; - host_err = -ENOENT; - if (d_really_is_negative(odentry)) - goto out_dput_old; - host_err = -EINVAL; - if (odentry == trap) - goto out_dput_old; - type = d_inode(odentry)->i_mode & S_IFMT; - - ndentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(tname, tlen), tdentry); - host_err = PTR_ERR(ndentry); - if (IS_ERR(ndentry)) - goto out_dput_old; - if (d_inode(ndentry)) - type = d_inode(ndentry)->i_mode & S_IFMT; - host_err = -ENOTEMPTY; - if (ndentry == trap) - goto out_dput_new; - - if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && - nfsd_has_cached_files(ndentry)) { - close_cached = true; - goto out_dput_old; + if ((rd.new_dentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && + nfsd_has_cached_files(rd.new_dentry)) { + close_cached = dget(rd.new_dentry); + goto out_unlock; } else { - struct renamedata rd = { - .mnt_idmap = &nop_mnt_idmap, - .old_parent = fdentry, - .old_dentry = odentry, - .new_parent = tdentry, - .new_dentry = ndentry, - }; int retries; for (retries = 1;;) { host_err = vfs_rename(&rd); if (host_err != -EAGAIN || !retries--) break; - if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry))) + if (!nfsd_wait_for_delegreturn(rqstp, d_inode(rd.old_dentry))) break; } if (!host_err) { @@ -1993,11 +1964,6 @@ retry: host_err = commit_metadata(ffhp); } } - out_dput_new: - dput(ndentry); - out_dput_old: - dput(odentry); - out_nfserr: if (host_err == -EBUSY) { /* * See RFC 8881 Section 18.26.4 para 1-3: NFSv4 RENAME @@ -2016,7 +1982,7 @@ retry: fh_fill_post_attrs(tfhp); } out_unlock: - unlock_rename(tdentry, fdentry); + end_renaming(&rd); out_want_write: fh_drop_write(ffhp); @@ -2027,9 +1993,8 @@ out_want_write: * until this point and then reattempt the whole shebang. */ if (close_cached) { - close_cached = false; - nfsd_close_cached_files(ndentry); - dput(ndentry); + nfsd_close_cached_files(close_cached); + dput(close_cached); goto retry; } out: @@ -2054,7 +2019,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, { struct dentry *dentry, *rdentry; struct inode *dirp; - struct inode *rinode; + struct inode *rinode = NULL; __be32 err; int host_err; @@ -2073,24 +2038,21 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, dentry = fhp->fh_dentry; dirp = d_inode(dentry); - inode_lock_nested(dirp, I_MUTEX_PARENT); - rdentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); + rdentry = start_removing(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen)); + host_err = PTR_ERR(rdentry); if (IS_ERR(rdentry)) - goto out_unlock; + goto out_drop_write; - if (d_really_is_negative(rdentry)) { - dput(rdentry); - host_err = -ENOENT; - goto out_unlock; - } - rinode = d_inode(rdentry); err = fh_fill_pre_attrs(fhp); if (err != nfs_ok) goto out_unlock; + rinode = d_inode(rdentry); + /* Prevent truncation until after locks dropped */ ihold(rinode); + if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; @@ -2108,14 +2070,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, break; } } else { - host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry); + host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry, NULL); } fh_fill_post_attrs(fhp); - inode_unlock(dirp); - if (!host_err) +out_unlock: + end_removing(rdentry); + if (!err && !host_err) host_err = commit_metadata(fhp); - dput(rdentry); iput(rinode); /* truncate the inode here */ out_drop_write: @@ -2133,9 +2095,6 @@ out_nfserr: } out: return err != nfs_ok ? err : nfserrno(host_err); -out_unlock: - inode_unlock(dirp); - goto out_drop_write; } /* diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 0c0292611c6d..09de48c50cbe 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -114,7 +114,7 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -int nfsd_open_verified(struct svc_fh *fhp, int may_flags, +int nfsd_open_verified(struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index ee0570cbdd9e..1ce8e12ae335 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -574,8 +574,9 @@ struct nfsd4_sequence { struct nfs4_sessionid sessionid; /* request/response */ u32 seqid; /* request/response */ u32 slotid; /* request/response */ - u32 maxslots; /* request/response */ + u32 maxslots; /* request */ u32 cachethis; /* request */ + u32 maxslots_response; /* response */ u32 target_maxslots; /* response */ u32 status_flags; /* response */ }; diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index bcc7d76269ac..4bbdc832d7f2 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -1148,7 +1148,7 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); if (unlikely(!cpfile)) return -ENOMEM; - if (!(cpfile->i_state & I_NEW)) + if (!(inode_state_read_once(cpfile) & I_NEW)) goto out; err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index c664daba56ae..674380837ab9 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -506,7 +506,7 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); if (unlikely(!dat)) return -ENOMEM; - if (!(dat->i_state & I_NEW)) + if (!(inode_state_read_once(dat) & I_NEW)) goto out; err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di)); diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index c4cd4a4dedd0..99eb8a59009e 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -188,7 +188,7 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO); if (unlikely(!ifile)) return -ENOMEM; - if (!(ifile->i_state & I_NEW)) + if (!(inode_state_read_once(ifile) & I_NEW)) goto out; err = nilfs_mdt_init(ifile, NILFS_MDT_GFP, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 87ddde159f0c..51bde45d5865 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -365,7 +365,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) failed_after_creation: clear_nlink(inode); - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); iput(inode); /* * raw_inode will be deleted through @@ -562,7 +562,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { if (!inode->i_nlink) { iput(inode); return ERR_PTR(-ESTALE); @@ -591,7 +591,7 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; err = nilfs_init_gcinode(inode); @@ -631,7 +631,7 @@ int nilfs_attach_btree_node_cache(struct inode *inode) nilfs_iget_set, &args); if (unlikely(!btnc_inode)) return -ENOMEM; - if (btnc_inode->i_state & I_NEW) { + if (inode_state_read_once(btnc_inode) & I_NEW) { nilfs_init_btnc_inode(btnc_inode); unlock_new_inode(btnc_inode); } @@ -686,7 +686,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode) nilfs_iget_set, &args); if (unlikely(!s_inode)) return ERR_PTR(-ENOMEM); - if (!(s_inode->i_state & I_NEW)) + if (!(inode_state_read_once(s_inode) & I_NEW)) return inode; NILFS_I(s_inode)->i_flags = 0; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index f466daa39440..b7e3d91b6243 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -14,6 +14,7 @@ #include <linux/buffer_head.h> #include <linux/spinlock.h> #include <linux/blkdev.h> +#include <linux/fs_struct.h> #include <linux/nilfs2_api.h> #include <linux/nilfs2_ondisk.h> #include "the_nilfs.h" diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f15ca6fc400d..deee16bc9d4e 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2768,7 +2768,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) if (sci->sc_task) { wake_up(&sci->sc_wait_daemon); - kthread_stop(sci->sc_task); + if (kthread_stop(sci->sc_task)) { + spin_lock(&sci->sc_state_lock); + sci->sc_task = NULL; + timer_shutdown_sync(&sci->sc_timer); + spin_unlock(&sci->sc_state_lock); + } } spin_lock(&sci->sc_state_lock); diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 330f269abedf..83f93337c01b 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -1226,7 +1226,7 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); if (unlikely(!sufile)) return -ENOMEM; - if (!(sufile->i_state & I_NEW)) + if (!(inode_state_read_once(sufile) & I_NEW)) goto out; err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui)); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 1dadda82cae5..d0b9b984002f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1597,16 +1597,20 @@ static struct hlist_head *fanotify_alloc_merge_hash(void) return hash; } +DEFINE_CLASS(fsnotify_group, + struct fsnotify_group *, + if (!IS_ERR_OR_NULL(_T)) fsnotify_destroy_group(_T), + fsnotify_alloc_group(ops, flags), + const struct fsnotify_ops *ops, int flags) + /* fanotify syscalls */ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) { struct user_namespace *user_ns = current_user_ns(); - struct fsnotify_group *group; int f_flags, fd; unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = flags & FANOTIFY_CLASS_BITS; unsigned int internal_flags = 0; - struct file *file; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); @@ -1690,36 +1694,29 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (flags & FAN_NONBLOCK) f_flags |= O_NONBLOCK; - /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ - group = fsnotify_alloc_group(&fanotify_fsnotify_ops, + CLASS(fsnotify_group, group)(&fanotify_fsnotify_ops, FSNOTIFY_GROUP_USER); - if (IS_ERR(group)) { + /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ + if (IS_ERR(group)) return PTR_ERR(group); - } /* Enforce groups limits per user in all containing user ns */ group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(), UCOUNT_FANOTIFY_GROUPS); - if (!group->fanotify_data.ucounts) { - fd = -EMFILE; - goto out_destroy_group; - } + if (!group->fanotify_data.ucounts) + return -EMFILE; group->fanotify_data.flags = flags | internal_flags; group->memcg = get_mem_cgroup_from_mm(current->mm); group->user_ns = get_user_ns(user_ns); group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); - if (!group->fanotify_data.merge_hash) { - fd = -ENOMEM; - goto out_destroy_group; - } + if (!group->fanotify_data.merge_hash) + return -ENOMEM; group->overflow_event = fanotify_alloc_overflow_event(); - if (unlikely(!group->overflow_event)) { - fd = -ENOMEM; - goto out_destroy_group; - } + if (unlikely(!group->overflow_event)) + return -ENOMEM; if (force_o_largefile()) event_f_flags |= O_LARGEFILE; @@ -1738,8 +1735,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->priority = FSNOTIFY_PRIO_PRE_CONTENT; break; default: - fd = -EINVAL; - goto out_destroy_group; + return -EINVAL; } BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE)); @@ -1750,27 +1746,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } if (flags & FAN_ENABLE_AUDIT) { - fd = -EPERM; if (!capable(CAP_AUDIT_WRITE)) - goto out_destroy_group; - } - - fd = get_unused_fd_flags(f_flags); - if (fd < 0) - goto out_destroy_group; - - file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group, - f_flags, FMODE_NONOTIFY); - if (IS_ERR(file)) { - put_unused_fd(fd); - fd = PTR_ERR(file); - goto out_destroy_group; + return -EPERM; } - fd_install(fd, file); - return fd; -out_destroy_group: - fsnotify_destroy_group(group); + fd = FD_ADD(f_flags, + anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, + group, f_flags, FMODE_NONOTIFY)); + if (fd >= 0) + retain_and_null_ptr(group); return fd; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 46bfc543f946..d27ff5e5f165 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -52,7 +52,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * the inode cannot have any associated watches. */ spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/nsfs.c b/fs/nsfs.c index 79b026a36fb6..bf27d5da91f1 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -58,6 +58,8 @@ const struct dentry_operations ns_dentry_operations = { static void nsfs_evict(struct inode *inode) { struct ns_common *ns = inode->i_private; + + __ns_ref_active_put(ns); clear_inode(inode); ns->ops->put(ns); } @@ -108,7 +110,6 @@ int ns_get_path(struct path *path, struct task_struct *task, int open_namespace(struct ns_common *ns) { struct path path __free(path_put) = {}; - struct file *f; int err; /* call first to consume reference */ @@ -116,16 +117,7 @@ int open_namespace(struct ns_common *ns) if (err < 0) return err; - CLASS(get_unused_fd, fd)(O_CLOEXEC); - if (fd < 0) - return fd; - - f = dentry_open(&path, O_RDONLY, current_cred()); - if (IS_ERR(f)) - return PTR_ERR(f); - - fd_install(fd, f); - return take_fd(fd); + return FD_ADD(O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred())); } int open_related_ns(struct ns_common *ns, @@ -311,7 +303,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct mnt_ns_info kinfo = {}; struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; struct path path __free(path_put) = {}; - struct file *f __free(fput) = NULL; size_t usize = _IOC_SIZE(ioctl); if (ns->ns_type != CLONE_NEWNS) @@ -330,28 +321,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (ret) return ret; - CLASS(get_unused_fd, fd)(O_CLOEXEC); - if (fd < 0) - return fd; - - f = dentry_open(&path, O_RDONLY, current_cred()); - if (IS_ERR(f)) - return PTR_ERR(f); - - if (uinfo) { - /* - * If @uinfo is passed return all information about the - * mount namespace as well. - */ - ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo); - if (ret) - return ret; - } - - /* Transfer reference of @f to caller's fdtable. */ - fd_install(fd, no_free_ptr(f)); - /* File descriptor is live so hand it off to the caller. */ - return take_fd(fd); + FD_PREPARE(fdf, O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred())); + if (fdf.err) + return fdf.err; + /* + * If @uinfo is passed return all information about the + * mount namespace as well. + */ + ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo); + if (ret) + return ret; + ret = fd_publish(fdf); + break; } default: ret = -ENOTTY; @@ -408,6 +389,7 @@ static const struct super_operations nsfs_ops = { .statfs = simple_statfs, .evict_inode = nsfs_evict, .show_path = nsfs_show_path, + .drop_inode = inode_just_drop, }; static int nsfs_init_inode(struct inode *inode, void *data) @@ -418,6 +400,16 @@ static int nsfs_init_inode(struct inode *inode, void *data) inode->i_mode |= S_IRUGO; inode->i_fop = &ns_file_operations; inode->i_ino = ns->inum; + + /* + * Bring the namespace subtree back to life if we have to. This + * can happen when e.g., all processes using a network namespace + * and all namespace files or namespace file bind-mounts have + * died but there are still sockets pinning it. The SIOCGSKNS + * ioctl on such a socket will resurrect the relevant namespace + * subtree. + */ + __ns_ref_active_get(ns); return 0; } @@ -458,6 +450,45 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return FILEID_NSFS; } +bool is_current_namespace(struct ns_common *ns) +{ + switch (ns->ns_type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + return current_in_namespace(to_cg_ns(ns)); +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + return current_in_namespace(to_ipc_ns(ns)); +#endif + case CLONE_NEWNS: + return current_in_namespace(to_mnt_ns(ns)); +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + return current_in_namespace(to_net_ns(ns)); +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + return current_in_namespace(to_pid_ns(ns)); +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + return current_in_namespace(to_time_ns(ns)); +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + return current_in_namespace(to_user_ns(ns)); +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + return current_in_namespace(to_uts_ns(ns)); +#endif + default: + VFS_WARN_ON_ONCE(true); + return false; + } +} + static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, int fh_len, int fh_type) { @@ -483,18 +514,35 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return NULL; } + if (!fid->ns_id) + return NULL; + /* Either both are set or both are unset. */ + if (!fid->ns_inum != !fid->ns_type) + return NULL; + scoped_guard(rcu) { ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); if (!ns) return NULL; VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); - VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); - if (ns->inum != fid->ns_inum) + if (fid->ns_inum && (fid->ns_inum != ns->inum)) + return NULL; + if (fid->ns_type && (fid->ns_type != ns->ns_type)) return NULL; - if (!__ns_ref_get(ns)) + /* + * This is racy because we're not actually taking an + * active reference. IOW, it could happen that the + * namespace becomes inactive after this check. + * We don't care because nsfs_init_inode() will just + * resurrect the relevant namespace tree for us. If it + * has been active here we just allow it's resurrection. + * We could try to take an active reference here and + * then drop it again. But really, why bother. + */ + if (!ns_get_unless_inactive(ns)) return NULL; } @@ -590,6 +638,8 @@ static int nsfs_init_fs_context(struct fs_context *fc) struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; + fc->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; + ctx->s_d_flags |= DCACHE_DONTCACHE; ctx->ops = &nsfs_ops; ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; @@ -612,3 +662,27 @@ void __init nsfs_init(void) nsfs_root_path.mnt = nsfs_mnt; nsfs_root_path.dentry = nsfs_mnt->mnt_root; } + +void nsproxy_ns_active_get(struct nsproxy *ns) +{ + ns_ref_active_get(ns->mnt_ns); + ns_ref_active_get(ns->uts_ns); + ns_ref_active_get(ns->ipc_ns); + ns_ref_active_get(ns->pid_ns_for_children); + ns_ref_active_get(ns->cgroup_ns); + ns_ref_active_get(ns->net_ns); + ns_ref_active_get(ns->time_ns); + ns_ref_active_get(ns->time_ns_for_children); +} + +void nsproxy_ns_active_put(struct nsproxy *ns) +{ + ns_ref_active_put(ns->mnt_ns); + ns_ref_active_put(ns->uts_ns); + ns_ref_active_put(ns->ipc_ns); + ns_ref_active_put(ns->pid_ns_for_children); + ns_ref_active_put(ns->cgroup_ns); + ns_ref_active_put(ns->net_ns); + ns_ref_active_put(ns->time_ns); + ns_ref_active_put(ns->time_ns_for_children); +} diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 3959f23c487a..08266adc42ba 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -537,7 +537,7 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, return ERR_PTR(-ENOMEM); /* If this is a freshly allocated inode, need to read it now. */ - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) inode = ntfs_read_mft(inode, name, ref); else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) { /* diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index ddff94c091b8..8d09dfec970a 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -51,6 +51,7 @@ #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/fs.h> +#include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/log2.h> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 62464d194da3..af1e2cedb217 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/fs_struct.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 162711cc5b20..b267ec580da9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6892,7 +6892,7 @@ static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start, ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1, &phys); - start = folio_next_index(folio) << PAGE_SHIFT; + start = folio_next_pos(folio); } out: if (folios) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 92a6149da9c1..619ff03b15d6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2487,7 +2487,7 @@ update: * which hasn't been populated yet, so clear the refresh flag * and let the caller handle it. */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { status = 0; if (lockres) ocfs2_complete_lock_res_refresh(lockres, 0); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index fcc89856ab95..78f81950c9ee 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -152,8 +152,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, mlog_errno(PTR_ERR(inode)); goto bail; } - trace_ocfs2_iget5_locked(inode->i_state); - if (inode->i_state & I_NEW) { + trace_ocfs2_iget5_locked(inode_state_read_once(inode)); + if (inode_state_read_once(inode) & I_NEW) { rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } @@ -1290,6 +1290,8 @@ static void ocfs2_clear_inode(struct inode *inode) void ocfs2_evict_inode(struct inode *inode) { + write_inode_now(inode, 1); + if (!inode->i_nlink || (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { ocfs2_delete_inode(inode); @@ -1299,27 +1301,6 @@ void ocfs2_evict_inode(struct inode *inode) ocfs2_clear_inode(inode); } -/* Called under inode_lock, with no more references on the - * struct inode, so it's safe here to check the flags field - * and to manipulate i_nlink without any other locks. */ -int ocfs2_drop_inode(struct inode *inode) -{ - struct ocfs2_inode_info *oi = OCFS2_I(inode); - - trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, - inode->i_nlink, oi->ip_flags); - - assert_spin_locked(&inode->i_lock); - inode->i_state |= I_WILL_FREE; - spin_unlock(&inode->i_lock); - write_inode_now(inode, 1); - spin_lock(&inode->i_lock); - WARN_ON(inode->i_state & I_NEW); - inode->i_state &= ~I_WILL_FREE; - - return 1; -} - /* * This is called from our getattr. */ diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index accf03d4765e..07bd838e7843 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -116,7 +116,6 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode) } void ocfs2_evict_inode(struct inode *inode); -int ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e5f58ff2175f..85239807dec7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -902,15 +902,8 @@ bail: static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { - struct address_space *mapping = jinode->i_vfs_inode->i_mapping; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = mapping->nrpages * 2, - .range_start = jinode->i_dirty_start, - .range_end = jinode->i_dirty_end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); + return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, + jinode->i_dirty_start, jinode->i_dirty_end); } int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 54ed1495de9a..4b32fb5658ad 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1569,8 +1569,6 @@ DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode); DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode); -DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode); - TRACE_EVENT(ocfs2_inode_revalidate, TP_PROTO(void *inode, unsigned long long ino, unsigned int flags), diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 53daa4482406..2c7ba1480f7a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -129,7 +129,7 @@ static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, .alloc_inode = ocfs2_alloc_inode, .free_inode = ocfs2_free_inode, - .drop_inode = ocfs2_drop_inode, + .drop_inode = inode_just_drop, .evict_inode = ocfs2_evict_inode, .sync_fs = ocfs2_sync_fs, .put_super = ocfs2_put_super, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 135c49c5d848..701ed85d9831 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -14,6 +14,7 @@ #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/crc-itu-t.h> +#include <linux/fs_struct.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "omfs.h" @@ -212,7 +213,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; bh = omfs_bread(inode->i_sb, ino); diff --git a/fs/open.c b/fs/open.c index 3d64372ecc67..f328622061c5 100644 --- a/fs/open.c +++ b/fs/open.c @@ -191,12 +191,9 @@ int do_ftruncate(struct file *file, loff_t length, int small) if (error) return error; - sb_start_write(inode->i_sb); - error = do_truncate(file_mnt_idmap(file), dentry, length, - ATTR_MTIME | ATTR_CTIME, file); - sb_end_write(inode->i_sb); - - return error; + scoped_guard(super_write, inode->i_sb) + return do_truncate(file_mnt_idmap(file), dentry, length, + ATTR_MTIME | ATTR_CTIME, file); } int do_sys_ftruncate(unsigned int fd, loff_t length, int small) @@ -631,7 +628,7 @@ out: int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; struct iattr newattrs; int error; @@ -651,7 +648,7 @@ retry_deleg: &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -756,7 +753,7 @@ int chown_common(const struct path *path, uid_t user, gid_t group) struct mnt_idmap *idmap; struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int error; struct iattr newattrs; kuid_t uid; @@ -791,7 +788,7 @@ retry_deleg: error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -940,7 +937,7 @@ static int do_dentry_open(struct file *f, } error = security_file_open(f); - if (error) + if (unlikely(error)) goto cleanup_all; /* @@ -950,11 +947,11 @@ static int do_dentry_open(struct file *f, * pseudo file, this call will not change the mode. */ error = fsnotify_open_perm_and_set_mode(f); - if (error) + if (unlikely(error)) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); - if (error) + if (unlikely(error)) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ @@ -1171,9 +1168,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, if (IS_ERR(f)) return f; - error = vfs_create(mnt_idmap(path->mnt), - d_inode(path->dentry->d_parent), - path->dentry, mode, true); + error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode, NULL); if (!error) error = vfs_open(path, f); @@ -1421,8 +1416,8 @@ static int do_sys_openat2(int dfd, const char __user *filename, struct open_how *how) { struct open_flags op; - struct filename *tmp; - int err, fd; + struct filename *tmp __free(putname) = NULL; + int err; err = build_open_flags(how, &op); if (unlikely(err)) @@ -1432,18 +1427,7 @@ static int do_sys_openat2(int dfd, const char __user *filename, if (IS_ERR(tmp)) return PTR_ERR(tmp); - fd = get_unused_fd_flags(how->flags); - if (likely(fd >= 0)) { - struct file *f = do_filp_open(dfd, tmp, &op); - if (IS_ERR(f)) { - put_unused_fd(fd); - fd = PTR_ERR(f); - } else { - fd_install(fd, f); - } - } - putname(tmp); - return fd; + return FD_ADD(how->flags, do_filp_open(dfd, tmp, &op)); } int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 26ecda0e4d19..fb8d84bdedfb 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -236,7 +236,7 @@ found: mutex_unlock(&op_mutex); if (IS_ERR(inode)) return ERR_CAST(inode); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { simple_inode_init_ts(inode); ent_oi = OP_I(inode); ent_oi->type = ent_type; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a01400cd41fd..d7275990ffa4 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -878,7 +878,9 @@ int orangefs_update_time(struct inode *inode, int flags) gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", get_khandle_from_ino(inode)); - flags = generic_update_time(inode, flags); + + flags = inode_update_timestamps(inode, flags); + memset(&iattr, 0, sizeof iattr); if (flags & S_ATIME) iattr.ia_valid |= ATTR_ATIME; @@ -1041,7 +1043,7 @@ struct inode *orangefs_iget(struct super_block *sb, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 0fdceb00ca07..9ab1119ebd28 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -247,7 +247,7 @@ again: spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || - orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) { + orangefs_inode->attr_valid || inode_state_read(inode) & I_DIRTY_PAGES) { if (orangefs_inode->attr_valid) { spin_unlock(&inode->i_lock); write_inode_now(inode, 1); @@ -281,13 +281,13 @@ again2: spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || - orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) { + orangefs_inode->attr_valid || inode_state_read(inode) & I_DIRTY_PAGES) { if (orangefs_inode->attr_valid) { spin_unlock(&inode->i_lock); write_inode_now(inode, 1); goto again2; } - if (inode->i_state & I_DIRTY_PAGES) { + if (inode_state_read(inode) & I_DIRTY_PAGES) { ret = 0; goto out_unlock; } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 604a82acd164..758611ee4475 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -523,8 +523,8 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *indexdir = ovl_indexdir(dentry->d_sb); - struct dentry *index = NULL; struct dentry *temp = NULL; + struct renamedata rd = {}; struct qstr name = { }; int err; @@ -556,17 +556,15 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, if (err) goto out; - err = ovl_parent_lock(indexdir, temp); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = indexdir; + rd.new_parent = indexdir; + err = start_renaming_dentry(&rd, 0, temp, &name); if (err) goto out; - index = ovl_lookup_upper(ofs, name.name, indexdir, name.len); - if (IS_ERR(index)) { - err = PTR_ERR(index); - } else { - err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0); - dput(index); - } - ovl_parent_unlock(indexdir); + + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); out: if (err) ovl_cleanup(ofs, indexdir, temp); @@ -613,9 +611,9 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) if (err) goto out; - inode_lock_nested(udir, I_MUTEX_PARENT); - upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir, - c->dentry->d_name.len); + upper = ovl_start_creating_upper(ofs, upperdir, + &QSTR_LEN(c->dentry->d_name.name, + c->dentry->d_name.len)); err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper); @@ -626,9 +624,8 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); ovl_dentry_update_reval(c->dentry, upper); } - dput(upper); + end_creating(upper); } - inode_unlock(udir); if (err) goto out; @@ -727,34 +724,33 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } -struct ovl_cu_creds { - const struct cred *old; - struct cred *new; -}; - -static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc) +static const struct cred *ovl_prepare_copy_up_creds(struct dentry *dentry) { + struct cred *copy_up_cred = NULL; int err; - cc->old = cc->new = NULL; - err = security_inode_copy_up(dentry, &cc->new); + err = security_inode_copy_up(dentry, ©_up_cred); if (err < 0) - return err; + return ERR_PTR(err); - if (cc->new) - cc->old = override_creds(cc->new); + if (!copy_up_cred) + return NULL; - return 0; + return override_creds(copy_up_cred); } -static void ovl_revert_cu_creds(struct ovl_cu_creds *cc) +static void ovl_revert_copy_up_creds(const struct cred *orig_cred) { - if (cc->new) { - revert_creds(cc->old); - put_cred(cc->new); - } + const struct cred *copy_up_cred; + + copy_up_cred = revert_creds(orig_cred); + put_cred(copy_up_cred); } +DEFINE_CLASS(copy_up_creds, const struct cred *, + if (!IS_ERR_OR_NULL(_T)) ovl_revert_copy_up_creds(_T), + ovl_prepare_copy_up_creds(dentry), struct dentry *dentry) + /* * Copyup using workdir to prepare temp file. Used when copying up directories, * special files or when upper fs doesn't support O_TMPFILE. @@ -764,8 +760,8 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *inode; struct path path = { .mnt = ovl_upper_mnt(ofs) }; - struct dentry *temp, *upper, *trap; - struct ovl_cu_creds cc; + struct renamedata rd = {}; + struct dentry *temp; int err; struct ovl_cattr cattr = { /* Can't properly set mode on creation because of the umask */ @@ -774,14 +770,14 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) .link = c->link }; - err = ovl_prep_cu_creds(c->dentry, &cc); - if (err) - return err; + scoped_class(copy_up_creds, copy_up_creds, c->dentry) { + if (IS_ERR(copy_up_creds)) + return PTR_ERR(copy_up_creds); - ovl_start_write(c->dentry); - temp = ovl_create_temp(ofs, c->workdir, &cattr); - ovl_end_write(c->dentry); - ovl_revert_cu_creds(&cc); + ovl_start_write(c->dentry); + temp = ovl_create_temp(ofs, c->workdir, &cattr); + ovl_end_write(c->dentry); + } if (IS_ERR(temp)) return PTR_ERR(temp); @@ -808,29 +804,24 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) * ovl_copy_up_data(), so lock workdir and destdir and make sure that * temp wasn't moved before copy up completion or cleanup. */ - trap = lock_rename(c->workdir, c->destdir); - if (trap || temp->d_parent != c->workdir) { - /* temp or workdir moved underneath us? abort without cleanup */ - dput(temp); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = c->workdir; + rd.new_parent = c->destdir; + rd.flags = 0; + err = start_renaming_dentry(&rd, 0, temp, + &QSTR_LEN(c->destname.name, c->destname.len)); + if (err) { + /* temp or workdir moved underneath us? map to -EIO */ err = -EIO; - if (!IS_ERR(trap)) - unlock_rename(c->workdir, c->destdir); - goto out; } - - err = ovl_copy_up_metadata(c, temp); if (err) - goto cleanup; + goto cleanup_unlocked; - upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, - c->destname.len); - err = PTR_ERR(upper); - if (IS_ERR(upper)) - goto cleanup; + err = ovl_copy_up_metadata(c, temp); + if (!err) + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); - err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0); - unlock_rename(c->workdir, c->destdir); - dput(upper); if (err) goto cleanup_unlocked; @@ -851,8 +842,6 @@ out: return err; -cleanup: - unlock_rename(c->workdir, c->destdir); cleanup_unlocked: ovl_cleanup(ofs, c->workdir, temp); dput(temp); @@ -866,17 +855,17 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) struct inode *udir = d_inode(c->destdir); struct dentry *temp, *upper; struct file *tmpfile; - struct ovl_cu_creds cc; int err; - err = ovl_prep_cu_creds(c->dentry, &cc); - if (err) - return err; + scoped_class(copy_up_creds, copy_up_creds, c->dentry) { + if (IS_ERR(copy_up_creds)) + return PTR_ERR(copy_up_creds); + + ovl_start_write(c->dentry); + tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); + ovl_end_write(c->dentry); + } - ovl_start_write(c->dentry); - tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); - ovl_end_write(c->dentry); - ovl_revert_cu_creds(&cc); if (IS_ERR(tmpfile)) return PTR_ERR(tmpfile); @@ -894,16 +883,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) goto out; - inode_lock_nested(udir, I_MUTEX_PARENT); - - upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir, - c->destname.len); + upper = ovl_start_creating_upper(ofs, c->destdir, + &QSTR_LEN(c->destname.name, + c->destname.len)); err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, temp, udir, upper); - dput(upper); + end_creating(upper); } - inode_unlock(udir); if (err) goto out; @@ -1214,7 +1201,6 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, static int ovl_copy_up_flags(struct dentry *dentry, int flags) { int err = 0; - const struct cred *old_cred; bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED); /* @@ -1234,7 +1220,6 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags) if (err) return err; - old_cred = ovl_override_creds(dentry->d_sb); while (!err) { struct dentry *next; struct dentry *parent = NULL; @@ -1254,12 +1239,12 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags) next = parent; } - err = ovl_copy_up_one(parent, next, flags); + with_ovl_creds(dentry->d_sb) + err = ovl_copy_up_one(parent, next, flags); dput(parent); dput(next); } - ovl_revert_creds(old_cred); return err; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index a5e9ddf3023b..06b860b9ded6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -47,79 +47,70 @@ static int ovl_cleanup_locked(struct ovl_fs *ofs, struct inode *wdir, int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *wdentry) { - int err; - - err = ovl_parent_lock(workdir, wdentry); - if (err) - return err; + wdentry = start_removing_dentry(workdir, wdentry); + if (IS_ERR(wdentry)) + return PTR_ERR(wdentry); ovl_cleanup_locked(ofs, workdir->d_inode, wdentry); - ovl_parent_unlock(workdir); + end_removing(wdentry); return 0; } -struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir) +void ovl_tempname(char name[OVL_TEMPNAME_SIZE]) { - struct dentry *temp; - char name[20]; static atomic_t temp_id = ATOMIC_INIT(0); /* counter is allowed to wrap, since temp dentries are ephemeral */ - snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); + snprintf(name, OVL_TEMPNAME_SIZE, "#%x", atomic_inc_return(&temp_id)); +} - temp = ovl_lookup_upper(ofs, name, workdir, strlen(name)); - if (!IS_ERR(temp) && temp->d_inode) { - pr_err("workdir/%s already exists\n", name); - dput(temp); - temp = ERR_PTR(-EIO); - } +static struct dentry *ovl_start_creating_temp(struct ovl_fs *ofs, + struct dentry *workdir) +{ + char name[OVL_TEMPNAME_SIZE]; - return temp; + ovl_tempname(name); + return start_creating(ovl_upper_mnt_idmap(ofs), workdir, + &QSTR(name)); } static struct dentry *ovl_whiteout(struct ovl_fs *ofs) { int err; - struct dentry *whiteout; + struct dentry *whiteout, *link; struct dentry *workdir = ofs->workdir; struct inode *wdir = workdir->d_inode; guard(mutex)(&ofs->whiteout_lock); if (!ofs->whiteout) { - inode_lock_nested(wdir, I_MUTEX_PARENT); - whiteout = ovl_lookup_temp(ofs, workdir); - if (!IS_ERR(whiteout)) { - err = ovl_do_whiteout(ofs, wdir, whiteout); - if (err) { - dput(whiteout); - whiteout = ERR_PTR(err); - } - } - inode_unlock(wdir); + whiteout = ovl_start_creating_temp(ofs, workdir); if (IS_ERR(whiteout)) return whiteout; - ofs->whiteout = whiteout; + err = ovl_do_whiteout(ofs, wdir, whiteout); + if (!err) + ofs->whiteout = dget(whiteout); + end_creating(whiteout); + if (err) + return ERR_PTR(err); } if (!ofs->no_shared_whiteout) { - inode_lock_nested(wdir, I_MUTEX_PARENT); - whiteout = ovl_lookup_temp(ofs, workdir); - if (!IS_ERR(whiteout)) { - err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout); - if (err) { - dput(whiteout); - whiteout = ERR_PTR(err); - } - } - inode_unlock(wdir); - if (!IS_ERR(whiteout)) + link = ovl_start_creating_temp(ofs, workdir); + if (IS_ERR(link)) + return link; + err = ovl_do_link(ofs, ofs->whiteout, wdir, link); + if (!err) + whiteout = dget(link); + end_creating(link); + if (!err) return whiteout; - if (PTR_ERR(whiteout) != -EMLINK) { - pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%lu)\n", + + if (err != -EMLINK) { + pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%u)\n", ofs->whiteout->d_inode->i_nlink, - PTR_ERR(whiteout)); + err); ofs->no_shared_whiteout = true; } } @@ -132,6 +123,7 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, struct dentry *dentry) { struct dentry *whiteout; + struct renamedata rd = {}; int err; int flags = 0; @@ -143,10 +135,14 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir, if (d_is_dir(dentry)) flags = RENAME_EXCHANGE; - err = ovl_lock_rename_workdir(ofs->workdir, whiteout, dir, dentry); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = ofs->workdir; + rd.new_parent = dir; + rd.flags = flags; + err = start_renaming_two_dentries(&rd, whiteout, dentry); if (!err) { - err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags); - unlock_rename(ofs->workdir, dir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); } if (err) goto kill_whiteout; @@ -191,7 +187,7 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, if (!err && ofs->casefold != ovl_dentry_casefolded(newdentry)) { pr_warn_ratelimited("wrong inherited casefold (%pd2)\n", newdentry); - dput(newdentry); + end_creating(newdentry); err = -EINVAL; } break; @@ -241,8 +237,7 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, } out: if (err) { - if (!IS_ERR(newdentry)) - dput(newdentry); + end_creating(newdentry); return ERR_PTR(err); } return newdentry; @@ -252,11 +247,11 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr) { struct dentry *ret; - inode_lock_nested(workdir->d_inode, I_MUTEX_PARENT); - ret = ovl_create_real(ofs, workdir, - ovl_lookup_temp(ofs, workdir), attr); - inode_unlock(workdir->d_inode); - return ret; + ret = ovl_start_creating_temp(ofs, workdir); + if (IS_ERR(ret)) + return ret; + ret = ovl_create_real(ofs, workdir, ret, attr); + return end_creating_keep(ret); } static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, @@ -354,18 +349,19 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); - struct inode *udir = upperdir->d_inode; struct dentry *newdentry; int err; - inode_lock_nested(udir, I_MUTEX_PARENT); - newdentry = ovl_create_real(ofs, upperdir, - ovl_lookup_upper(ofs, dentry->d_name.name, - upperdir, dentry->d_name.len), - attr); - inode_unlock(udir); + newdentry = ovl_start_creating_upper(ofs, upperdir, + &QSTR_LEN(dentry->d_name.name, + dentry->d_name.len)); if (IS_ERR(newdentry)) return PTR_ERR(newdentry); + newdentry = ovl_create_real(ofs, upperdir, newdentry, attr); + if (IS_ERR(newdentry)) + return PTR_ERR(newdentry); + + end_creating_keep(newdentry); if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) && !ovl_allow_offline_changes(ofs)) { @@ -391,6 +387,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct renamedata rd = {}; struct path upperpath; struct dentry *upper; struct dentry *opaquedir; @@ -416,7 +413,11 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (IS_ERR(opaquedir)) goto out; - err = ovl_lock_rename_workdir(workdir, opaquedir, upperdir, upper); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = workdir; + rd.new_parent = upperdir; + rd.flags = RENAME_EXCHANGE; + err = start_renaming_two_dentries(&rd, opaquedir, upper); if (err) goto out_cleanup_unlocked; @@ -434,8 +435,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE); - unlock_rename(workdir, upperdir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) goto out_cleanup_unlocked; @@ -448,7 +449,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, return opaquedir; out_cleanup: - unlock_rename(workdir, upperdir); + end_renaming(&rd); out_cleanup_unlocked: ovl_cleanup(ofs, workdir, opaquedir); dput(opaquedir); @@ -471,6 +472,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *workdir = ovl_workdir(dentry); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct renamedata rd = {}; struct dentry *upper; struct dentry *newdentry; int err; @@ -502,7 +504,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (IS_ERR(newdentry)) goto out_dput; - err = ovl_lock_rename_workdir(workdir, newdentry, upperdir, upper); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = workdir; + rd.new_parent = upperdir; + rd.flags = 0; + err = start_renaming_two_dentries(&rd, newdentry, upper); if (err) goto out_cleanup_unlocked; @@ -539,16 +545,16 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; - err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, - RENAME_EXCHANGE); - unlock_rename(workdir, upperdir); + rd.flags = RENAME_EXCHANGE; + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) goto out_cleanup_unlocked; ovl_cleanup(ofs, workdir, upper); } else { - err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0); - unlock_rename(workdir, upperdir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) goto out_cleanup_unlocked; } @@ -568,66 +574,76 @@ out: return err; out_cleanup: - unlock_rename(workdir, upperdir); + end_renaming(&rd); out_cleanup_unlocked: ovl_cleanup(ofs, workdir, newdentry); dput(newdentry); goto out_dput; } -static const struct cred *ovl_setup_cred_for_create(struct dentry *dentry, - struct inode *inode, - umode_t mode, - const struct cred *old_cred) +static const struct cred *ovl_override_creator_creds(struct dentry *dentry, struct inode *inode, umode_t mode) { int err; - struct cred *override_cred; - override_cred = prepare_creds(); + if (WARN_ON_ONCE(current->cred != ovl_creds(dentry->d_sb))) + return ERR_PTR(-EINVAL); + + CLASS(prepare_creds, override_cred)(); if (!override_cred) return ERR_PTR(-ENOMEM); override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; + err = security_dentry_create_files_as(dentry, mode, &dentry->d_name, - old_cred, override_cred); - if (err) { - put_cred(override_cred); + current->cred, override_cred); + if (err) return ERR_PTR(err); - } - /* - * Caller is going to match this with revert_creds() and drop - * referenec on the returned creds. - * We must be called with creator creds already, otherwise we risk - * leaking creds. - */ - old_cred = override_creds(override_cred); - WARN_ON_ONCE(old_cred != ovl_creds(dentry->d_sb)); + return override_creds(no_free_ptr(override_cred)); +} + +static void ovl_revert_creator_creds(const struct cred *old_cred) +{ + const struct cred *override_cred; - return override_cred; + override_cred = revert_creds(old_cred); + put_cred(override_cred); +} + +DEFINE_CLASS(ovl_override_creator_creds, + const struct cred *, + if (!IS_ERR_OR_NULL(_T)) ovl_revert_creator_creds(_T), + ovl_override_creator_creds(dentry, inode, mode), + struct dentry *dentry, struct inode *inode, umode_t mode) + +static int ovl_create_handle_whiteouts(struct dentry *dentry, + struct inode *inode, + struct ovl_cattr *attr) +{ + if (!ovl_dentry_is_whiteout(dentry)) + return ovl_create_upper(dentry, inode, attr); + + return ovl_create_over_whiteout(dentry, inode, attr); } static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, struct ovl_cattr *attr, bool origin) { int err; - const struct cred *old_cred, *new_cred = NULL; struct dentry *parent = dentry->d_parent; - old_cred = ovl_override_creds(dentry->d_sb); - - /* - * When linking a file with copy up origin into a new parent, mark the - * new parent dir "impure". - */ - if (origin) { - err = ovl_set_impure(parent, ovl_dentry_upper(parent)); - if (err) - goto out_revert_creds; - } + with_ovl_creds(dentry->d_sb) { + /* + * When linking a file with copy up origin into a new parent, mark the + * new parent dir "impure". + */ + if (origin) { + err = ovl_set_impure(parent, ovl_dentry_upper(parent)); + if (err) + return err; + } - if (!attr->hardlink) { /* * In the creation cases(create, mkdir, mknod, symlink), * ovl should transfer current's fs{u,g}id to underlying @@ -641,23 +657,16 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, * create a new inode, so just use the ovl mounter's * fs{u,g}id. */ - new_cred = ovl_setup_cred_for_create(dentry, inode, attr->mode, - old_cred); - err = PTR_ERR(new_cred); - if (IS_ERR(new_cred)) { - new_cred = NULL; - goto out_revert_creds; - } - } - if (!ovl_dentry_is_whiteout(dentry)) - err = ovl_create_upper(dentry, inode, attr); - else - err = ovl_create_over_whiteout(dentry, inode, attr); + if (attr->hardlink) + return ovl_create_handle_whiteouts(dentry, inode, attr); -out_revert_creds: - ovl_revert_creds(old_cred); - put_cred(new_cred); + scoped_class(ovl_override_creator_creds, cred, dentry, inode, attr->mode) { + if (IS_ERR(cred)) + return PTR_ERR(cred); + return ovl_create_handle_whiteouts(dentry, inode, attr); + } + } return err; } @@ -686,7 +695,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, goto out_drop_write; spin_lock(&inode->i_lock); - inode->i_state |= I_CREATING; + inode_state_set(inode, I_CREATING); spin_unlock(&inode->i_lock); inode_init_owner(&nop_mnt_idmap, inode, dentry->d_parent->d_inode, mode); @@ -733,14 +742,8 @@ static int ovl_symlink(struct mnt_idmap *idmap, struct inode *dir, static int ovl_set_link_redirect(struct dentry *dentry) { - const struct cred *old_cred; - int err; - - old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_set_redirect(dentry, false); - ovl_revert_creds(old_cred); - - return err; + with_ovl_creds(dentry->d_sb) + return ovl_set_redirect(dentry, false); } static int ovl_link(struct dentry *old, struct inode *newdir, @@ -850,17 +853,17 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, goto out; } - inode_lock_nested(dir, I_MUTEX_PARENT); - upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir, - dentry->d_name.len); + upper = ovl_start_removing_upper(ofs, upperdir, + &QSTR_LEN(dentry->d_name.name, + dentry->d_name.len)); err = PTR_ERR(upper); if (IS_ERR(upper)) - goto out_unlock; + goto out_dput; err = -ESTALE; if ((opaquedir && upper != opaquedir) || (!opaquedir && !ovl_matches_upper(dentry, upper))) - goto out_dput_upper; + goto out_unlock; if (is_dir) err = ovl_do_rmdir(ofs, dir, upper); @@ -876,10 +879,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, */ if (!err) d_drop(dentry); -out_dput_upper: - dput(upper); out_unlock: - inode_unlock(dir); + end_removing(upper); +out_dput: dput(opaquedir); out: return err; @@ -916,7 +918,6 @@ static void ovl_drop_nlink(struct dentry *dentry) static int ovl_do_remove(struct dentry *dentry, bool is_dir) { int err; - const struct cred *old_cred; bool lower_positive = ovl_lower_positive(dentry); LIST_HEAD(list); @@ -935,12 +936,12 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (err) goto out; - old_cred = ovl_override_creds(dentry->d_sb); - if (!lower_positive) - err = ovl_remove_upper(dentry, is_dir, &list); - else - err = ovl_remove_and_whiteout(dentry, &list); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) { + if (!lower_positive) + err = ovl_remove_upper(dentry, is_dir, &list); + else + err = ovl_remove_and_whiteout(dentry, &list); + } if (!err) { if (is_dir) clear_nlink(dentry->d_inode); @@ -1104,102 +1105,107 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) return err; } -static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, - struct dentry *old, struct inode *newdir, - struct dentry *new, unsigned int flags) +struct ovl_renamedata { + struct renamedata; + struct dentry *opaquedir; + bool cleanup_whiteout; + bool update_nlink; + bool overwrite; +}; + +static int ovl_rename_start(struct ovl_renamedata *ovlrd, struct list_head *list) { - int err; - struct dentry *old_upperdir; - struct dentry *new_upperdir; - struct dentry *olddentry = NULL; - struct dentry *newdentry = NULL; - struct dentry *trap, *de; - bool old_opaque; - bool new_opaque; - bool cleanup_whiteout = false; - bool update_nlink = false; - bool overwrite = !(flags & RENAME_EXCHANGE); + struct dentry *old = ovlrd->old_dentry; + struct dentry *new = ovlrd->new_dentry; bool is_dir = d_is_dir(old); bool new_is_dir = d_is_dir(new); - bool samedir = olddir == newdir; - struct dentry *opaquedir = NULL; - const struct cred *old_cred = NULL; - struct ovl_fs *ofs = OVL_FS(old->d_sb); - LIST_HEAD(list); + int err; - err = -EINVAL; - if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) - goto out; + if (ovlrd->flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) + return -EINVAL; - flags &= ~RENAME_NOREPLACE; + ovlrd->flags &= ~RENAME_NOREPLACE; /* Don't copy up directory trees */ err = -EXDEV; if (!ovl_can_move(old)) - goto out; - if (!overwrite && !ovl_can_move(new)) - goto out; + return err; + if (!ovlrd->overwrite && !ovl_can_move(new)) + return err; - if (overwrite && new_is_dir && !ovl_pure_upper(new)) { - err = ovl_check_empty_dir(new, &list); + if (ovlrd->overwrite && new_is_dir && !ovl_pure_upper(new)) { + err = ovl_check_empty_dir(new, list); if (err) - goto out; + return err; } - if (overwrite) { + if (ovlrd->overwrite) { if (ovl_lower_positive(old)) { if (!ovl_dentry_is_whiteout(new)) { /* Whiteout source */ - flags |= RENAME_WHITEOUT; + ovlrd->flags |= RENAME_WHITEOUT; } else { /* Switch whiteouts */ - flags |= RENAME_EXCHANGE; + ovlrd->flags |= RENAME_EXCHANGE; } } else if (is_dir && ovl_dentry_is_whiteout(new)) { - flags |= RENAME_EXCHANGE; - cleanup_whiteout = true; + ovlrd->flags |= RENAME_EXCHANGE; + ovlrd->cleanup_whiteout = true; } } err = ovl_copy_up(old); if (err) - goto out; + return err; err = ovl_copy_up(new->d_parent); if (err) - goto out; - if (!overwrite) { + return err; + + if (!ovlrd->overwrite) { err = ovl_copy_up(new); if (err) - goto out; + return err; } else if (d_inode(new)) { err = ovl_nlink_start(new); if (err) - goto out; + return err; - update_nlink = true; + ovlrd->update_nlink = true; } - if (!update_nlink) { + if (!ovlrd->update_nlink) { /* ovl_nlink_start() took ovl_want_write() */ err = ovl_want_write(old); if (err) - goto out; + return err; } - old_cred = ovl_override_creds(old->d_sb); + return 0; +} - if (!list_empty(&list)) { - opaquedir = ovl_clear_empty(new, &list); - err = PTR_ERR(opaquedir); - if (IS_ERR(opaquedir)) { - opaquedir = NULL; - goto out_revert_creds; - } - } +static int ovl_rename_upper(struct ovl_renamedata *ovlrd, struct list_head *list) +{ + struct dentry *old = ovlrd->old_dentry; + struct dentry *new = ovlrd->new_dentry; + struct ovl_fs *ofs = OVL_FS(old->d_sb); + struct dentry *old_upperdir = ovl_dentry_upper(old->d_parent); + struct dentry *new_upperdir = ovl_dentry_upper(new->d_parent); + bool is_dir = d_is_dir(old); + bool new_is_dir = d_is_dir(new); + bool samedir = old->d_parent == new->d_parent; + struct renamedata rd = {}; + struct dentry *de; + struct dentry *whiteout = NULL; + bool old_opaque, new_opaque; + int err; - old_upperdir = ovl_dentry_upper(old->d_parent); - new_upperdir = ovl_dentry_upper(new->d_parent); + if (!list_empty(list)) { + de = ovl_clear_empty(new, list); + if (IS_ERR(de)) + return PTR_ERR(de); + ovlrd->opaquedir = de; + } if (!samedir) { /* @@ -1211,95 +1217,88 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, if (ovl_type_origin(old)) { err = ovl_set_impure(new->d_parent, new_upperdir); if (err) - goto out_revert_creds; + return err; } - if (!overwrite && ovl_type_origin(new)) { + if (!ovlrd->overwrite && ovl_type_origin(new)) { err = ovl_set_impure(old->d_parent, old_upperdir); if (err) - goto out_revert_creds; + return err; } } - trap = lock_rename(new_upperdir, old_upperdir); - if (IS_ERR(trap)) { - err = PTR_ERR(trap); - goto out_revert_creds; - } + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = old_upperdir; + rd.new_parent = new_upperdir; + rd.flags = ovlrd->flags; - de = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir, - old->d_name.len); - err = PTR_ERR(de); - if (IS_ERR(de)) - goto out_unlock; - olddentry = de; + err = start_renaming(&rd, 0, + &QSTR_LEN(old->d_name.name, old->d_name.len), + &QSTR_LEN(new->d_name.name, new->d_name.len)); + if (err) + return err; err = -ESTALE; - if (!ovl_matches_upper(old, olddentry)) + if (!ovl_matches_upper(old, rd.old_dentry)) goto out_unlock; - de = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir, - new->d_name.len); - err = PTR_ERR(de); - if (IS_ERR(de)) - goto out_unlock; - newdentry = de; - old_opaque = ovl_dentry_is_opaque(old); new_opaque = ovl_dentry_is_opaque(new); err = -ESTALE; if (d_inode(new) && ovl_dentry_upper(new)) { - if (opaquedir) { - if (newdentry != opaquedir) + if (ovlrd->opaquedir) { + if (rd.new_dentry != ovlrd->opaquedir) goto out_unlock; } else { - if (!ovl_matches_upper(new, newdentry)) + if (!ovl_matches_upper(new, rd.new_dentry)) goto out_unlock; } } else { - if (!d_is_negative(newdentry)) { - if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry)) + if (!d_is_negative(rd.new_dentry)) { + if (!new_opaque || !ovl_upper_is_whiteout(ofs, rd.new_dentry)) goto out_unlock; } else { - if (flags & RENAME_EXCHANGE) + if (ovlrd->flags & RENAME_EXCHANGE) goto out_unlock; } } - if (olddentry == trap) - goto out_unlock; - if (newdentry == trap) - goto out_unlock; - - if (olddentry->d_inode == newdentry->d_inode) + if (rd.old_dentry->d_inode == rd.new_dentry->d_inode) goto out_unlock; err = 0; if (ovl_type_merge_or_lower(old)) err = ovl_set_redirect(old, samedir); else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent)) - err = ovl_set_opaque_xerr(old, olddentry, -EXDEV); + err = ovl_set_opaque_xerr(old, rd.old_dentry, -EXDEV); if (err) goto out_unlock; - if (!overwrite && ovl_type_merge_or_lower(new)) + if (!ovlrd->overwrite && ovl_type_merge_or_lower(new)) err = ovl_set_redirect(new, samedir); - else if (!overwrite && new_is_dir && !new_opaque && + else if (!ovlrd->overwrite && new_is_dir && !new_opaque && ovl_type_merge(old->d_parent)) - err = ovl_set_opaque_xerr(new, newdentry, -EXDEV); + err = ovl_set_opaque_xerr(new, rd.new_dentry, -EXDEV); if (err) goto out_unlock; - err = ovl_do_rename(ofs, old_upperdir, olddentry, - new_upperdir, newdentry, flags); - unlock_rename(new_upperdir, old_upperdir); + err = ovl_do_rename_rd(&rd); + + if (!err && ovlrd->cleanup_whiteout) + whiteout = dget(rd.new_dentry); + +out_unlock: + end_renaming(&rd); + if (err) - goto out_revert_creds; + return err; - if (cleanup_whiteout) - ovl_cleanup(ofs, old_upperdir, newdentry); + if (whiteout) { + ovl_cleanup(ofs, old_upperdir, whiteout); + dput(whiteout); + } - if (overwrite && d_inode(new)) { + if (ovlrd->overwrite && d_inode(new)) { if (new_is_dir) clear_nlink(d_inode(new)); else @@ -1307,7 +1306,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } ovl_dir_modified(old->d_parent, ovl_type_origin(old) || - (!overwrite && ovl_type_origin(new))); + (!ovlrd->overwrite && ovl_type_origin(new))); ovl_dir_modified(new->d_parent, ovl_type_origin(old) || (d_inode(new) && ovl_type_origin(new))); @@ -1316,28 +1315,47 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, if (d_inode(new) && ovl_dentry_upper(new)) ovl_copyattr(d_inode(new)); -out_revert_creds: - ovl_revert_creds(old_cred); - if (update_nlink) - ovl_nlink_end(new); + return err; +} + +static void ovl_rename_end(struct ovl_renamedata *ovlrd) +{ + if (ovlrd->update_nlink) + ovl_nlink_end(ovlrd->new_dentry); else - ovl_drop_write(old); -out: - dput(newdentry); - dput(olddentry); - dput(opaquedir); + ovl_drop_write(ovlrd->old_dentry); +} + +static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, + struct dentry *old, struct inode *newdir, + struct dentry *new, unsigned int flags) +{ + struct ovl_renamedata ovlrd = { + .old_parent = old->d_parent, + .old_dentry = old, + .new_parent = new->d_parent, + .new_dentry = new, + .flags = flags, + .overwrite = !(flags & RENAME_EXCHANGE), + }; + LIST_HEAD(list); + int err; + + err = ovl_rename_start(&ovlrd, &list); + if (!err) { + with_ovl_creds(old->d_sb) + err = ovl_rename_upper(&ovlrd, &list); + ovl_rename_end(&ovlrd); + } + + dput(ovlrd.opaquedir); ovl_cache_free(&list); return err; - -out_unlock: - unlock_rename(new_upperdir, old_upperdir); - goto out_revert_creds; } static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, struct inode *inode, umode_t mode) { - const struct cred *old_cred, *new_cred = NULL; struct path realparentpath; struct file *realfile; struct ovl_file *of; @@ -1346,41 +1364,36 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, int flags = file->f_flags | OVL_OPEN_FLAGS; int err; - old_cred = ovl_override_creds(dentry->d_sb); - new_cred = ovl_setup_cred_for_create(dentry, inode, mode, old_cred); - err = PTR_ERR(new_cred); - if (IS_ERR(new_cred)) { - new_cred = NULL; - goto out_revert_creds; - } + with_ovl_creds(dentry->d_sb) { + scoped_class(ovl_override_creator_creds, cred, dentry, inode, mode) { + if (IS_ERR(cred)) + return PTR_ERR(cred); - ovl_path_upper(dentry->d_parent, &realparentpath); - realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath, - mode, current_cred()); - err = PTR_ERR_OR_ZERO(realfile); - pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err); - if (err) - goto out_revert_creds; + ovl_path_upper(dentry->d_parent, &realparentpath); + realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath, + mode, current_cred()); + err = PTR_ERR_OR_ZERO(realfile); + pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err); + if (err) + return err; - of = ovl_file_alloc(realfile); - if (!of) { - fput(realfile); - err = -ENOMEM; - goto out_revert_creds; - } + of = ovl_file_alloc(realfile); + if (!of) { + fput(realfile); + return -ENOMEM; + } - /* ovl_instantiate() consumes the newdentry reference on success */ - newdentry = dget(realfile->f_path.dentry); - err = ovl_instantiate(dentry, inode, newdentry, false, file); - if (!err) { - file->private_data = of; - } else { - dput(newdentry); - ovl_file_free(of); + /* ovl_instantiate() consumes the newdentry reference on success */ + newdentry = dget(realfile->f_path.dentry); + err = ovl_instantiate(dentry, inode, newdentry, false, file); + if (!err) { + file->private_data = of; + } else { + dput(newdentry); + ovl_file_free(of); + } + } } -out_revert_creds: - ovl_revert_creds(old_cred); - put_cred(new_cred); return err; } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 7ab2c9daffd0..cbae89457234 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -31,7 +31,6 @@ static struct file *ovl_open_realfile(const struct file *file, struct inode *inode = file_inode(file); struct mnt_idmap *real_idmap; struct file *realfile; - const struct cred *old_cred; int flags = file->f_flags | OVL_OPEN_FLAGS; int acc_mode = ACC_MODE(flags); int err; @@ -39,19 +38,19 @@ static struct file *ovl_open_realfile(const struct file *file, if (flags & O_APPEND) acc_mode |= MAY_APPEND; - old_cred = ovl_override_creds(inode->i_sb); - real_idmap = mnt_idmap(realpath->mnt); - err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode); - if (err) { - realfile = ERR_PTR(err); - } else { - if (!inode_owner_or_capable(real_idmap, realinode)) - flags &= ~O_NOATIME; - - realfile = backing_file_open(file_user_path(file), - flags, realpath, current_cred()); + with_ovl_creds(inode->i_sb) { + real_idmap = mnt_idmap(realpath->mnt); + err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode); + if (err) { + realfile = ERR_PTR(err); + } else { + if (!inode_owner_or_capable(real_idmap, realinode)) + flags &= ~O_NOATIME; + + realfile = backing_file_open(file_user_path(file), + flags, realpath, current_cred()); + } } - ovl_revert_creds(old_cred); pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n", file, file, ovl_whatisit(inode, realinode), file->f_flags, @@ -244,7 +243,6 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); struct file *realfile; - const struct cred *old_cred; loff_t ret; /* @@ -273,9 +271,8 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) ovl_inode_lock(inode); realfile->f_pos = file->f_pos; - old_cred = ovl_override_creds(inode->i_sb); - ret = vfs_llseek(realfile, offset, whence); - ovl_revert_creds(old_cred); + with_ovl_creds(inode->i_sb) + ret = vfs_llseek(realfile, offset, whence); file->f_pos = realfile->f_pos; ovl_inode_unlock(inode); @@ -447,7 +444,6 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) enum ovl_path_type type; struct path upperpath; struct file *upperfile; - const struct cred *old_cred; int ret; ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); @@ -464,11 +460,8 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (IS_ERR(upperfile)) return PTR_ERR(upperfile); - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_fsync_range(upperfile, start, end, datasync); - ovl_revert_creds(old_cred); - - return ret; + with_ovl_creds(file_inode(file)->i_sb) + return vfs_fsync_range(upperfile, start, end, datasync); } static int ovl_mmap(struct file *file, struct vm_area_struct *vma) @@ -486,7 +479,6 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len { struct inode *inode = file_inode(file); struct file *realfile; - const struct cred *old_cred; int ret; inode_lock(inode); @@ -501,9 +493,8 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len if (IS_ERR(realfile)) goto out_unlock; - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_fallocate(realfile, mode, offset, len); - ovl_revert_creds(old_cred); + with_ovl_creds(inode->i_sb) + ret = vfs_fallocate(realfile, mode, offset, len); /* Update size */ ovl_file_modified(file); @@ -517,18 +508,13 @@ out_unlock: static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { struct file *realfile; - const struct cred *old_cred; - int ret; realfile = ovl_real_file(file); if (IS_ERR(realfile)) return PTR_ERR(realfile); - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_fadvise(realfile, offset, len, advice); - ovl_revert_creds(old_cred); - - return ret; + with_ovl_creds(file_inode(file)->i_sb) + return vfs_fadvise(realfile, offset, len, advice); } enum ovl_copyop { @@ -543,7 +529,6 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, { struct inode *inode_out = file_inode(file_out); struct file *realfile_in, *realfile_out; - const struct cred *old_cred; loff_t ret; inode_lock(inode_out); @@ -565,25 +550,25 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, if (IS_ERR(realfile_in)) goto out_unlock; - old_cred = ovl_override_creds(file_inode(file_out)->i_sb); - switch (op) { - case OVL_COPY: - ret = vfs_copy_file_range(realfile_in, pos_in, - realfile_out, pos_out, len, flags); - break; - - case OVL_CLONE: - ret = vfs_clone_file_range(realfile_in, pos_in, - realfile_out, pos_out, len, flags); - break; - - case OVL_DEDUPE: - ret = vfs_dedupe_file_range_one(realfile_in, pos_in, - realfile_out, pos_out, len, - flags); - break; + with_ovl_creds(file_inode(file_out)->i_sb) { + switch (op) { + case OVL_COPY: + ret = vfs_copy_file_range(realfile_in, pos_in, + realfile_out, pos_out, len, flags); + break; + + case OVL_CLONE: + ret = vfs_clone_file_range(realfile_in, pos_in, + realfile_out, pos_out, len, flags); + break; + + case OVL_DEDUPE: + ret = vfs_dedupe_file_range_one(realfile_in, pos_in, + realfile_out, pos_out, len, + flags); + break; + } } - ovl_revert_creds(old_cred); /* Update size */ ovl_file_modified(file_out); @@ -632,7 +617,6 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, static int ovl_flush(struct file *file, fl_owner_t id) { struct file *realfile; - const struct cred *old_cred; int err = 0; realfile = ovl_real_file(file); @@ -640,9 +624,8 @@ static int ovl_flush(struct file *file, fl_owner_t id) return PTR_ERR(realfile); if (realfile->f_op->flush) { - old_cred = ovl_override_creds(file_inode(file)->i_sb); - err = realfile->f_op->flush(realfile, id); - ovl_revert_creds(old_cred); + with_ovl_creds(file_inode(file)->i_sb) + err = realfile->f_op->flush(realfile, id); } return err; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index e11f310ce092..bdbf86b56a9b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -25,7 +25,6 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct ovl_fs *ofs = OVL_FS(dentry->d_sb); bool full_copy_up = false; struct dentry *upperdentry; - const struct cred *old_cred; err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) @@ -78,9 +77,8 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, goto out_put_write; inode_lock(upperdentry->d_inode); - old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_do_notify_change(ofs, upperdentry, attr); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + err = ovl_do_notify_change(ofs, upperdentry, attr); if (!err) ovl_copyattr(dentry->d_inode); inode_unlock(upperdentry->d_inode); @@ -153,13 +151,22 @@ static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) } } +static inline int ovl_real_getattr_nosec(struct super_block *sb, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int flags) +{ + with_ovl_creds(sb) + return vfs_getattr_nosec(path, stat, request_mask, flags); +} + int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; + struct super_block *sb = dentry->d_sb; enum ovl_path_type type; struct path realpath; - const struct cred *old_cred; struct inode *inode = d_inode(dentry); bool is_dir = S_ISDIR(inode->i_mode); int fsid = 0; @@ -169,10 +176,9 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, metacopy_blocks = ovl_is_metacopy_dentry(dentry); type = ovl_path_real(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr_nosec(&realpath, stat, request_mask, flags); + err = ovl_real_getattr_nosec(sb, &realpath, stat, request_mask, flags); if (err) - goto out; + return err; /* Report the effective immutable/append-only STATX flags */ generic_fill_statx_attr(inode, stat); @@ -195,10 +201,9 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, (!is_dir ? STATX_NLINK : 0); ovl_path_lower(dentry, &realpath); - err = vfs_getattr_nosec(&realpath, &lowerstat, lowermask, - flags); + err = ovl_real_getattr_nosec(sb, &realpath, &lowerstat, lowermask, flags); if (err) - goto out; + return err; /* * Lower hardlinks may be broken on copy up to different @@ -248,10 +253,10 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, ovl_path_lowerdata(dentry, &realpath); if (realpath.dentry) { - err = vfs_getattr_nosec(&realpath, &lowerdatastat, - lowermask, flags); + err = ovl_real_getattr_nosec(sb, &realpath, &lowerdatastat, + lowermask, flags); if (err) - goto out; + return err; } else { lowerdatastat.blocks = round_up(stat->size, stat->blksize) >> 9; @@ -279,9 +284,6 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry))) stat->nlink = dentry->d_inode->i_nlink; -out: - ovl_revert_creds(old_cred); - return err; } @@ -291,7 +293,6 @@ int ovl_permission(struct mnt_idmap *idmap, struct inode *upperinode = ovl_inode_upper(inode); struct inode *realinode; struct path realpath; - const struct cred *old_cred; int err; /* Careful in RCU walk mode */ @@ -309,33 +310,26 @@ int ovl_permission(struct mnt_idmap *idmap, if (err) return err; - old_cred = ovl_override_creds(inode->i_sb); if (!upperinode && !special_file(realinode->i_mode) && mask & MAY_WRITE) { mask &= ~(MAY_WRITE | MAY_APPEND); /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } - err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask); - ovl_revert_creds(old_cred); - return err; + with_ovl_creds(inode->i_sb) + return inode_permission(mnt_idmap(realpath.mnt), realinode, mask); } static const char *ovl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - const struct cred *old_cred; - const char *p; - if (!dentry) return ERR_PTR(-ECHILD); - old_cred = ovl_override_creds(dentry->d_sb); - p = vfs_get_link(ovl_dentry_real(dentry), done); - ovl_revert_creds(old_cred); - return p; + with_ovl_creds(dentry->d_sb) + return vfs_get_link(ovl_dentry_real(dentry), done); } #ifdef CONFIG_FS_POSIX_ACL @@ -465,11 +459,8 @@ struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, acl = get_cached_acl_rcu(realinode, type); } else { - const struct cred *old_cred; - - old_cred = ovl_override_creds(inode->i_sb); - acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm); - ovl_revert_creds(old_cred); + with_ovl_creds(inode->i_sb) + acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm); } return acl; @@ -481,7 +472,6 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, int err; struct path realpath; const char *acl_name; - const struct cred *old_cred; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); @@ -495,10 +485,8 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, struct posix_acl *real_acl; ovl_path_lower(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, - acl_name); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, acl_name); if (IS_ERR(real_acl)) { err = PTR_ERR(real_acl); goto out; @@ -518,12 +506,12 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, if (err) goto out; - old_cred = ovl_override_creds(dentry->d_sb); - if (acl) - err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); - else - err = ovl_do_remove_acl(ofs, realdentry, acl_name); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) { + if (acl) + err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); + else + err = ovl_do_remove_acl(ofs, realdentry, acl_name); + } ovl_drop_write(dentry); /* copy c/mtime */ @@ -588,9 +576,7 @@ int ovl_update_time(struct inode *inode, int flags) static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - int err; struct inode *realinode = ovl_inode_realdata(inode); - const struct cred *old_cred; if (!realinode) return -EIO; @@ -598,11 +584,8 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (!realinode->i_op->fiemap) return -EOPNOTSUPP; - old_cred = ovl_override_creds(inode->i_sb); - err = realinode->i_op->fiemap(realinode, fieinfo, start, len); - ovl_revert_creds(old_cred); - - return err; + with_ovl_creds(inode->i_sb) + return realinode->i_op->fiemap(realinode, fieinfo, start, len); } /* @@ -653,7 +636,6 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, { struct inode *inode = d_inode(dentry); struct path upperpath; - const struct cred *old_cred; unsigned int flags; int err; @@ -665,18 +647,18 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, if (err) goto out; - old_cred = ovl_override_creds(inode->i_sb); - /* - * Store immutable/append-only flags in xattr and clear them - * in upper fileattr (in case they were set by older kernel) - * so children of "ovl-immutable" directories lower aliases of - * "ovl-immutable" hardlinks could be copied up. - * Clear xattr when flags are cleared. - */ - err = ovl_set_protattr(inode, upperpath.dentry, fa); - if (!err) - err = ovl_real_fileattr_set(&upperpath, fa); - ovl_revert_creds(old_cred); + with_ovl_creds(inode->i_sb) { + /* + * Store immutable/append-only flags in xattr and clear them + * in upper fileattr (in case they were set by older kernel) + * so children of "ovl-immutable" directories lower aliases of + * "ovl-immutable" hardlinks could be copied up. + * Clear xattr when flags are cleared. + */ + err = ovl_set_protattr(inode, upperpath.dentry, fa); + if (!err) + err = ovl_real_fileattr_set(&upperpath, fa); + } ovl_drop_write(dentry); /* @@ -730,15 +712,13 @@ int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct path realpath; - const struct cred *old_cred; int err; ovl_path_real(dentry, &realpath); - old_cred = ovl_override_creds(inode->i_sb); - err = ovl_real_fileattr_get(&realpath, fa); + with_ovl_creds(inode->i_sb) + err = ovl_real_fileattr_get(&realpath, fa); ovl_fileattr_prot_flags(inode, fa); - ovl_revert_creds(old_cred); return err; } @@ -1152,7 +1132,7 @@ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) if (!trap) return ERR_PTR(-ENOMEM); - if (!(trap->i_state & I_NEW)) { + if (!(inode_state_read_once(trap) & I_NEW)) { /* Conflicting layer roots? */ iput(trap); return ERR_PTR(-ELOOP); @@ -1243,7 +1223,7 @@ struct inode *ovl_get_inode(struct super_block *sb, inode = ovl_iget5(sb, oip->newinode, key); if (!inode) goto out_err; - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { /* * Verify that the underlying files stored in the inode * match those in the dentry. @@ -1303,7 +1283,7 @@ struct inode *ovl_get_inode(struct super_block *sb, if (upperdentry) ovl_check_protattr(inode, upperdentry); - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) unlock_new_inode(inode); out: return inode; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index e93bcc5727bc..e9a69c95be91 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -979,15 +979,10 @@ static int ovl_maybe_validate_verity(struct dentry *dentry) return err; if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) { - const struct cred *old_cred; - - old_cred = ovl_override_creds(dentry->d_sb); - - err = ovl_validate_verity(ofs, &metapath, &datapath); + with_ovl_creds(dentry->d_sb) + err = ovl_validate_verity(ofs, &metapath, &datapath); if (err == 0) ovl_set_flag(OVL_VERIFIED_DIGEST, inode); - - ovl_revert_creds(old_cred); } ovl_inode_unlock(inode); @@ -1001,7 +996,6 @@ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry) struct inode *inode = d_inode(dentry); const char *redirect = ovl_lowerdata_redirect(inode); struct ovl_path datapath = {}; - const struct cred *old_cred; int err; if (!redirect || ovl_dentry_lowerdata(dentry)) @@ -1019,9 +1013,8 @@ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry) if (ovl_dentry_lowerdata(dentry)) goto out; - old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_lookup_data_layers(dentry, redirect, &datapath); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + err = ovl_lookup_data_layers(dentry, redirect, &datapath); if (err) goto out_err; @@ -1077,57 +1070,44 @@ static bool ovl_check_follow_redirect(struct ovl_lookup_data *d) return true; } -struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) +struct ovl_lookup_ctx { + struct dentry *dentry; + struct ovl_entry *oe; + struct ovl_path *stack; + struct ovl_path *origin_path; + struct dentry *upperdentry; + struct dentry *index; + struct inode *inode; + unsigned int ctr; +}; + +static int ovl_lookup_layers(struct ovl_lookup_ctx *ctx, struct ovl_lookup_data *d) { - struct ovl_entry *oe = NULL; - const struct cred *old_cred; + struct dentry *dentry = ctx->dentry; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_entry *poe = OVL_E(dentry->d_parent); struct ovl_entry *roe = OVL_E(dentry->d_sb->s_root); - struct ovl_path *stack = NULL, *origin_path = NULL; - struct dentry *upperdir, *upperdentry = NULL; - struct dentry *origin = NULL; - struct dentry *index = NULL; - unsigned int ctr = 0; - struct inode *inode = NULL; - bool upperopaque = false; bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer); + struct dentry *upperdir; struct dentry *this; - unsigned int i; - int err; + struct dentry *origin = NULL; + bool upperopaque = false; bool uppermetacopy = false; int metacopy_size = 0; - struct ovl_lookup_data d = { - .sb = dentry->d_sb, - .dentry = dentry, - .name = dentry->d_name, - .is_dir = false, - .opaque = false, - .stop = false, - .last = check_redirect ? false : !ovl_numlower(poe), - .redirect = NULL, - .upperredirect = NULL, - .metacopy = 0, - }; - - if (dentry->d_name.len > ofs->namelen) - return ERR_PTR(-ENAMETOOLONG); + unsigned int i; + int err; - old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { - d.layer = &ofs->layers[0]; - err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); + d->layer = &ofs->layers[0]; + err = ovl_lookup_layer(upperdir, d, &ctx->upperdentry, true); if (err) - goto out; + return err; - if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) { - dput(upperdentry); - err = -EREMOTE; - goto out; - } - if (upperdentry && !d.is_dir) { + if (ctx->upperdentry && ctx->upperdentry->d_flags & DCACHE_OP_REAL) + return -EREMOTE; + + if (ctx->upperdentry && !d->is_dir) { /* * Lookup copy up origin by decoding origin file handle. * We may get a disconnected dentry, which is fine, @@ -1138,50 +1118,50 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * number - it's the same as if we held a reference * to a dentry in lower layer that was moved under us. */ - err = ovl_check_origin(ofs, upperdentry, &origin_path); + err = ovl_check_origin(ofs, ctx->upperdentry, &ctx->origin_path); if (err) - goto out_put_upper; + return err; - if (d.metacopy) + if (d->metacopy) uppermetacopy = true; - metacopy_size = d.metacopy; + metacopy_size = d->metacopy; } - if (d.redirect) { + if (d->redirect) { err = -ENOMEM; - d.upperredirect = kstrdup(d.redirect, GFP_KERNEL); - if (!d.upperredirect) - goto out_put_upper; - if (d.redirect[0] == '/') + d->upperredirect = kstrdup(d->redirect, GFP_KERNEL); + if (!d->upperredirect) + return err; + if (d->redirect[0] == '/') poe = roe; } - upperopaque = d.opaque; + upperopaque = d->opaque; } - if (!d.stop && ovl_numlower(poe)) { + if (!d->stop && ovl_numlower(poe)) { err = -ENOMEM; - stack = ovl_stack_alloc(ofs->numlayer - 1); - if (!stack) - goto out_put_upper; + ctx->stack = ovl_stack_alloc(ofs->numlayer - 1); + if (!ctx->stack) + return err; } - for (i = 0; !d.stop && i < ovl_numlower(poe); i++) { + for (i = 0; !d->stop && i < ovl_numlower(poe); i++) { struct ovl_path lower = ovl_lowerstack(poe)[i]; - if (!ovl_check_follow_redirect(&d)) { + if (!ovl_check_follow_redirect(d)) { err = -EPERM; - goto out_put; + return err; } if (!check_redirect) - d.last = i == ovl_numlower(poe) - 1; - else if (d.is_dir || !ofs->numdatalayer) - d.last = lower.layer->idx == ovl_numlower(roe); + d->last = i == ovl_numlower(poe) - 1; + else if (d->is_dir || !ofs->numdatalayer) + d->last = lower.layer->idx == ovl_numlower(roe); - d.layer = lower.layer; - err = ovl_lookup_layer(lower.dentry, &d, &this, false); + d->layer = lower.layer; + err = ovl_lookup_layer(lower.dentry, d, &this, false); if (err) - goto out_put; + return err; if (!this) continue; @@ -1190,11 +1170,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * If no origin fh is stored in upper of a merge dir, store fh * of lower dir and set upper parent "impure". */ - if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) { - err = ovl_fix_origin(ofs, dentry, this, upperdentry); + if (ctx->upperdentry && !ctx->ctr && !ofs->noxattr && d->is_dir) { + err = ovl_fix_origin(ofs, dentry, this, ctx->upperdentry); if (err) { dput(this); - goto out_put; + return err; } } @@ -1207,23 +1187,23 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * matches the dentry found using path based lookup, * otherwise error out. */ - if (upperdentry && !ctr && - ((d.is_dir && ovl_verify_lower(dentry->d_sb)) || - (!d.is_dir && ofs->config.index && origin_path))) { - err = ovl_verify_origin(ofs, upperdentry, this, false); + if (ctx->upperdentry && !ctx->ctr && + ((d->is_dir && ovl_verify_lower(dentry->d_sb)) || + (!d->is_dir && ofs->config.index && ctx->origin_path))) { + err = ovl_verify_origin(ofs, ctx->upperdentry, this, false); if (err) { dput(this); - if (d.is_dir) + if (d->is_dir) break; - goto out_put; + return err; } origin = this; } - if (!upperdentry && !d.is_dir && !ctr && d.metacopy) - metacopy_size = d.metacopy; + if (!ctx->upperdentry && !d->is_dir && !ctx->ctr && d->metacopy) + metacopy_size = d->metacopy; - if (d.metacopy && ctr) { + if (d->metacopy && ctx->ctr) { /* * Do not store intermediate metacopy dentries in * lower chain, except top most lower metacopy dentry. @@ -1233,15 +1213,15 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, dput(this); this = NULL; } else { - stack[ctr].dentry = this; - stack[ctr].layer = lower.layer; - ctr++; + ctx->stack[ctx->ctr].dentry = this; + ctx->stack[ctx->ctr].layer = lower.layer; + ctx->ctr++; } - if (d.stop) + if (d->stop) break; - if (d.redirect && d.redirect[0] == '/' && poe != roe) { + if (d->redirect && d->redirect[0] == '/' && poe != roe) { poe = roe; /* Find the current layer on the root dentry */ i = lower.layer->idx - 1; @@ -1252,12 +1232,12 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * Defer lookup of lowerdata in data-only layers to first access. * Don't require redirect=follow and metacopy=on in this case. */ - if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) { - d.metacopy = 0; - ctr++; - } else if (!ovl_check_follow_redirect(&d)) { + if (d->metacopy && ctx->ctr && ofs->numdatalayer && d->absolute_redirect) { + d->metacopy = 0; + ctx->ctr++; + } else if (!ovl_check_follow_redirect(d)) { err = -EPERM; - goto out_put; + return err; } /* @@ -1268,20 +1248,20 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * For metacopy dentry, path based lookup will find lower dentries. * Just make sure a corresponding data dentry has been found. */ - if (d.metacopy || (uppermetacopy && !ctr)) { + if (d->metacopy || (uppermetacopy && !ctx->ctr)) { pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n", dentry); err = -EIO; - goto out_put; - } else if (!d.is_dir && upperdentry && !ctr && origin_path) { - if (WARN_ON(stack != NULL)) { + return err; + } else if (!d->is_dir && ctx->upperdentry && !ctx->ctr && ctx->origin_path) { + if (WARN_ON(ctx->stack != NULL)) { err = -EIO; - goto out_put; + return err; } - stack = origin_path; - ctr = 1; - origin = origin_path->dentry; - origin_path = NULL; + ctx->stack = ctx->origin_path; + ctx->ctr = 1; + origin = ctx->origin_path->dentry; + ctx->origin_path = NULL; } /* @@ -1303,38 +1283,39 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * is enabled and if upper had an ORIGIN xattr. * */ - if (!upperdentry && ctr) - origin = stack[0].dentry; + if (!ctx->upperdentry && ctx->ctr) + origin = ctx->stack[0].dentry; if (origin && ovl_indexdir(dentry->d_sb) && - (!d.is_dir || ovl_index_all(dentry->d_sb))) { - index = ovl_lookup_index(ofs, upperdentry, origin, true); - if (IS_ERR(index)) { - err = PTR_ERR(index); - index = NULL; - goto out_put; + (!d->is_dir || ovl_index_all(dentry->d_sb))) { + ctx->index = ovl_lookup_index(ofs, ctx->upperdentry, origin, true); + if (IS_ERR(ctx->index)) { + err = PTR_ERR(ctx->index); + ctx->index = NULL; + return err; } } - if (ctr) { - oe = ovl_alloc_entry(ctr); + if (ctx->ctr) { + ctx->oe = ovl_alloc_entry(ctx->ctr); err = -ENOMEM; - if (!oe) - goto out_put; + if (!ctx->oe) + return err; - ovl_stack_cpy(ovl_lowerstack(oe), stack, ctr); + ovl_stack_cpy(ovl_lowerstack(ctx->oe), ctx->stack, ctx->ctr); } if (upperopaque) ovl_dentry_set_opaque(dentry); - if (d.xwhiteouts) + if (d->xwhiteouts) ovl_dentry_set_xwhiteouts(dentry); - if (upperdentry) + if (ctx->upperdentry) ovl_dentry_set_upper_alias(dentry); - else if (index) { + else if (ctx->index) { + char *upperredirect; struct path upperpath = { - .dentry = upperdentry = dget(index), + .dentry = ctx->upperdentry = dget(ctx->index), .mnt = ovl_upper_mnt(ofs), }; @@ -1343,84 +1324,100 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * assignment happens only if upperdentry is non-NULL, and * this one only if upperdentry is NULL. */ - d.upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); - if (IS_ERR(d.upperredirect)) { - err = PTR_ERR(d.upperredirect); - d.upperredirect = NULL; - goto out_free_oe; - } + upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); + if (IS_ERR(upperredirect)) + return PTR_ERR(upperredirect); + d->upperredirect = upperredirect; err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL); if (err < 0) - goto out_free_oe; - d.metacopy = uppermetacopy = err; + return err; + d->metacopy = uppermetacopy = err; metacopy_size = err; - if (!ovl_check_follow_redirect(&d)) { + if (!ovl_check_follow_redirect(d)) { err = -EPERM; - goto out_free_oe; + return err; } } - if (upperdentry || ctr) { + if (ctx->upperdentry || ctx->ctr) { + struct inode *inode; struct ovl_inode_params oip = { - .upperdentry = upperdentry, - .oe = oe, - .index = index, - .redirect = d.upperredirect, + .upperdentry = ctx->upperdentry, + .oe = ctx->oe, + .index = ctx->index, + .redirect = d->upperredirect, }; /* Store lowerdata redirect for lazy lookup */ - if (ctr > 1 && !d.is_dir && !stack[ctr - 1].dentry) { - oip.lowerdata_redirect = d.redirect; - d.redirect = NULL; + if (ctx->ctr > 1 && !d->is_dir && !ctx->stack[ctx->ctr - 1].dentry) { + oip.lowerdata_redirect = d->redirect; + d->redirect = NULL; } + inode = ovl_get_inode(dentry->d_sb, &oip); - err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out_free_oe; - if (upperdentry && !uppermetacopy) - ovl_set_flag(OVL_UPPERDATA, inode); + return PTR_ERR(inode); + + ctx->inode = inode; + if (ctx->upperdentry && !uppermetacopy) + ovl_set_flag(OVL_UPPERDATA, ctx->inode); if (metacopy_size > OVL_METACOPY_MIN_SIZE) - ovl_set_flag(OVL_HAS_DIGEST, inode); + ovl_set_flag(OVL_HAS_DIGEST, ctx->inode); } - ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode)); + ovl_dentry_init_reval(dentry, ctx->upperdentry, OVL_I_E(ctx->inode)); + + return 0; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct ovl_entry *poe = OVL_E(dentry->d_parent); + bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer); + int err; + struct ovl_lookup_ctx ctx = { + .dentry = dentry, + }; + struct ovl_lookup_data d = { + .sb = dentry->d_sb, + .dentry = dentry, + .name = dentry->d_name, + .last = check_redirect ? false : !ovl_numlower(poe), + }; + + if (dentry->d_name.len > ofs->namelen) + return ERR_PTR(-ENAMETOOLONG); + + with_ovl_creds(dentry->d_sb) + err = ovl_lookup_layers(&ctx, &d); - ovl_revert_creds(old_cred); - if (origin_path) { - dput(origin_path->dentry); - kfree(origin_path); + if (ctx.origin_path) { + dput(ctx.origin_path->dentry); + kfree(ctx.origin_path); } - dput(index); - ovl_stack_free(stack, ctr); + dput(ctx.index); + ovl_stack_free(ctx.stack, ctx.ctr); kfree(d.redirect); - return d_splice_alias(inode, dentry); -out_free_oe: - ovl_free_entry(oe); -out_put: - dput(index); - ovl_stack_free(stack, ctr); -out_put_upper: - if (origin_path) { - dput(origin_path->dentry); - kfree(origin_path); + if (err) { + ovl_free_entry(ctx.oe); + dput(ctx.upperdentry); + kfree(d.upperredirect); + return ERR_PTR(err); } - dput(upperdentry); - kfree(d.upperredirect); -out: - kfree(d.redirect); - ovl_revert_creds(old_cred); - return ERR_PTR(err); + + return d_splice_alias(ctx.inode, dentry); } bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); const struct qstr *name = &dentry->d_name; - const struct cred *old_cred; unsigned int i; bool positive = false; bool done = false; @@ -1436,46 +1433,45 @@ bool ovl_lower_positive(struct dentry *dentry) if (!ovl_dentry_upper(dentry)) return true; - old_cred = ovl_override_creds(dentry->d_sb); - /* Positive upper -> have to look up lower to see whether it exists */ - for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) { - struct dentry *this; - struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; + with_ovl_creds(dentry->d_sb) { + /* Positive upper -> have to look up lower to see whether it exists */ + for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) { + struct dentry *this; + struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; - /* - * We need to make a non-const copy of dentry->d_name, - * because lookup_one_positive_unlocked() will hash name - * with parentpath base, which is on another (lower fs). - */ - this = lookup_one_positive_unlocked( - mnt_idmap(parentpath->layer->mnt), - &QSTR_LEN(name->name, name->len), - parentpath->dentry); - if (IS_ERR(this)) { - switch (PTR_ERR(this)) { - case -ENOENT: - case -ENAMETOOLONG: - break; - - default: - /* - * Assume something is there, we just couldn't - * access it. - */ - positive = true; - break; + /* + * We need to make a non-const copy of dentry->d_name, + * because lookup_one_positive_unlocked() will hash name + * with parentpath base, which is on another (lower fs). + */ + this = lookup_one_positive_unlocked(mnt_idmap(parentpath->layer->mnt), + &QSTR_LEN(name->name, name->len), + parentpath->dentry); + if (IS_ERR(this)) { + switch (PTR_ERR(this)) { + case -ENOENT: + case -ENAMETOOLONG: + break; + + default: + /* + * Assume something is there, we just couldn't + * access it. + */ + positive = true; + break; + } + } else { + struct path path = { + .dentry = this, + .mnt = parentpath->layer->mnt, + }; + positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path); + done = true; + dput(this); } - } else { - struct path path = { - .dentry = this, - .mnt = parentpath->layer->mnt, - }; - positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path); - done = true; - dput(this); } } - ovl_revert_creds(old_cred); return positive; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index c8fd5951fc5e..f9ac9bdde830 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -206,7 +206,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs, static inline int ovl_do_rmdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { - int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry); + int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; @@ -235,7 +235,7 @@ static inline int ovl_do_create(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true); + int err = vfs_create(ovl_upper_mnt_idmap(ofs), dentry, mode, NULL); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; @@ -248,7 +248,7 @@ static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs, { struct dentry *ret; - ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); + ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, NULL); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret)); return ret; } @@ -257,7 +257,7 @@ static inline int ovl_do_mknod(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev); + int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev, NULL); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; @@ -267,7 +267,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, const char *oldname) { - int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname); + int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname, NULL); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; @@ -355,11 +355,24 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } +static inline int ovl_do_rename_rd(struct renamedata *rd) +{ + int err; + + pr_debug("rename(%pd2, %pd2, 0x%x)\n", rd->old_dentry, rd->new_dentry, + rd->flags); + err = vfs_rename(rd); + if (err) { + pr_debug("...rename(%pd2, %pd2, ...) = %i\n", + rd->old_dentry, rd->new_dentry, err); + } + return err; +} + static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, struct dentry *olddentry, struct dentry *newdir, struct dentry *newdentry, unsigned int flags) { - int err; struct renamedata rd = { .mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_parent = olddir, @@ -369,13 +382,7 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, .flags = flags, }; - pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); - err = vfs_rename(&rd); - if (err) { - pr_debug("...rename(%pd2, %pd2, ...) = %i\n", - olddentry, newdentry, err); - } - return err; + return ovl_do_rename_rd(&rd); } static inline int ovl_do_whiteout(struct ovl_fs *ofs, @@ -415,6 +422,22 @@ static inline struct dentry *ovl_lookup_upper_unlocked(struct ovl_fs *ofs, &QSTR_LEN(name, len), base); } +static inline struct dentry *ovl_start_creating_upper(struct ovl_fs *ofs, + struct dentry *parent, + struct qstr *name) +{ + return start_creating(ovl_upper_mnt_idmap(ofs), + parent, name); +} + +static inline struct dentry *ovl_start_removing_upper(struct ovl_fs *ofs, + struct dentry *parent, + struct qstr *name) +{ + return start_removing(ovl_upper_mnt_idmap(ofs), + parent, name); +} + static inline bool ovl_open_flags_need_copy_up(int flags) { if (!flags) @@ -424,11 +447,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags) } /* util.c */ -int ovl_parent_lock(struct dentry *parent, struct dentry *child); -static inline void ovl_parent_unlock(struct dentry *parent) -{ - inode_unlock(parent->d_inode); -} int ovl_get_write_access(struct dentry *dentry); void ovl_put_write_access(struct dentry *dentry); void ovl_start_write(struct dentry *dentry); @@ -437,7 +455,11 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); -void ovl_revert_creds(const struct cred *old_cred); + +EXTEND_CLASS(override_creds, _ovl, ovl_override_creds(sb), struct super_block *sb) + +#define with_ovl_creds(sb) \ + scoped_class(override_creds_ovl, __UNIQUE_ID(label), sb) static inline const struct cred *ovl_creds(struct super_block *sb) { @@ -865,7 +887,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, struct dentry *newdentry, struct ovl_cattr *attr); int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *dentry); -struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir); +#define OVL_TEMPNAME_SIZE 20 +void ovl_tempname(char name[OVL_TEMPNAME_SIZE]); struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 1e9792cc557b..160960bb0ad0 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -348,11 +348,7 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { - int err = 0; struct dentry *dentry, *dir = path->dentry; - const struct cred *old_cred; - - old_cred = ovl_override_creds(rdd->dentry->d_sb); while (rdd->first_maybe_whiteout) { struct ovl_cache_entry *p = @@ -365,13 +361,11 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); } else if (PTR_ERR(dentry) == -EINTR) { - err = -EINTR; - break; + return -EINTR; } } - ovl_revert_creds(old_cred); - return err; + return 0; } static inline int ovl_dir_read(const struct path *realpath, @@ -838,36 +832,12 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) return err; } - -static int ovl_iterate(struct file *file, struct dir_context *ctx) +static int ovl_iterate_merged(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; - struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_cache_entry *p; - const struct cred *old_cred; - int err; - - old_cred = ovl_override_creds(dentry->d_sb); - if (!ctx->pos) - ovl_dir_reset(file); - - if (od->is_real) { - /* - * If parent is merge, then need to adjust d_ino for '..', if - * dir is impure then need to adjust d_ino for copied up - * entries. - */ - if (ovl_xino_bits(ofs) || - (ovl_same_fs(ofs) && - (ovl_is_impure_dir(file) || - OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { - err = ovl_iterate_real(file, ctx); - } else { - err = iterate_dir(od->realfile, ctx); - } - goto out; - } + int err = 0; if (!od->cache) { struct ovl_dir_cache *cache; @@ -875,7 +845,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) cache = ovl_cache_get(dentry); err = PTR_ERR(cache); if (IS_ERR(cache)) - goto out; + return err; od->cache = cache; ovl_seek_cursor(od, ctx->pos); @@ -887,7 +857,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) if (!p->ino || p->check_xwhiteout) { err = ovl_cache_update(&file->f_path, p, !p->ino); if (err) - goto out; + return err; } } /* ovl_cache_update() sets is_whiteout on stale entry */ @@ -898,12 +868,50 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) od->cursor = p->l_node.next; ctx->pos++; } - err = 0; -out: - ovl_revert_creds(old_cred); return err; } +static bool ovl_need_adjust_d_ino(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + + /* If parent is merge, then need to adjust d_ino for '..' */ + if (ovl_xino_bits(ofs)) + return true; + + /* Can't do consistent inode numbering */ + if (!ovl_same_fs(ofs)) + return false; + + /* If dir is impure then need to adjust d_ino for copied up entries */ + if (ovl_is_impure_dir(file) || + OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))) + return true; + + /* Pure: no need to adjust d_ino */ + return false; +} + + +static int ovl_iterate(struct file *file, struct dir_context *ctx) +{ + struct ovl_dir_file *od = file->private_data; + + if (!ctx->pos) + ovl_dir_reset(file); + + with_ovl_creds(file_dentry(file)->d_sb) { + if (!od->is_real) + return ovl_iterate_merged(file, ctx); + + if (ovl_need_adjust_d_ino(file)) + return ovl_iterate_real(file, ctx); + + return iterate_dir(od->realfile, ctx); + } +} + static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) { loff_t res; @@ -947,14 +955,8 @@ out_unlock: static struct file *ovl_dir_open_realfile(const struct file *file, const struct path *realpath) { - struct file *res; - const struct cred *old_cred; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); - ovl_revert_creds(old_cred); - - return res; + with_ovl_creds(file_inode(file)->i_sb) + return ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); } /* @@ -1075,11 +1077,9 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) int err; struct ovl_cache_entry *p, *n; struct rb_root root = RB_ROOT; - const struct cred *old_cred; - old_cred = ovl_override_creds(dentry->d_sb); - err = ovl_dir_read_merged(dentry, list, &root); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + err = ovl_dir_read_merged(dentry, list, &root); if (err) return err; @@ -1242,11 +1242,11 @@ int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent, if (!d_is_dir(dentry) || level > 1) return ovl_cleanup(ofs, parent, dentry); - err = ovl_parent_lock(parent, dentry); - if (err) - return err; + dentry = start_removing_dentry(parent, dentry); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); err = ovl_do_rmdir(ofs, parent->d_inode, dentry); - ovl_parent_unlock(parent); + end_removing(dentry); if (err) { struct path path = { .mnt = mnt, .dentry = dentry }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 43ee4c7296a7..28b2f707cfbc 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -310,8 +310,7 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, bool retried = false; retry: - inode_lock_nested(dir, I_MUTEX_PARENT); - work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name)); + work = ovl_start_creating_upper(ofs, ofs->workbasedir, &QSTR(name)); if (!IS_ERR(work)) { struct iattr attr = { @@ -320,14 +319,12 @@ retry: }; if (work->d_inode) { + end_creating_keep(work); + if (persist) + return work; err = -EEXIST; - inode_unlock(dir); if (retried) goto out_dput; - - if (persist) - return work; - retried = true; err = ovl_workdir_cleanup(ofs, ofs->workbasedir, mnt, work, 0); dput(work); @@ -338,7 +335,7 @@ retry: } work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode); - inode_unlock(dir); + end_creating_keep(work); err = PTR_ERR(work); if (IS_ERR(work)) goto out_err; @@ -376,7 +373,6 @@ retry: if (err) goto out_dput; } else { - inode_unlock(dir); err = PTR_ERR(work); goto out_err; } @@ -567,9 +563,10 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) { struct dentry *workdir = ofs->workdir; struct dentry *temp; - struct dentry *dest; struct dentry *whiteout; struct name_snapshot name; + struct renamedata rd = {}; + char name2[OVL_TEMPNAME_SIZE]; int err; temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0)); @@ -577,23 +574,21 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) if (IS_ERR(temp)) return err; - err = ovl_parent_lock(workdir, temp); + rd.mnt_idmap = ovl_upper_mnt_idmap(ofs); + rd.old_parent = workdir; + rd.new_parent = workdir; + rd.flags = RENAME_WHITEOUT; + ovl_tempname(name2); + err = start_renaming_dentry(&rd, 0, temp, &QSTR(name2)); if (err) { dput(temp); return err; } - dest = ovl_lookup_temp(ofs, workdir); - err = PTR_ERR(dest); - if (IS_ERR(dest)) { - dput(temp); - ovl_parent_unlock(workdir); - return err; - } /* Name is inline and stable - using snapshot as a copy helper */ take_dentry_name_snapshot(&name, temp); - err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT); - ovl_parent_unlock(workdir); + err = ovl_do_rename_rd(&rd); + end_renaming(&rd); if (err) { if (err == -EINVAL) err = 0; @@ -617,7 +612,6 @@ cleanup_temp: ovl_cleanup(ofs, workdir, temp); release_dentry_name_snapshot(&name); dput(temp); - dput(dest); return err; } @@ -626,14 +620,15 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs, struct dentry *parent, const char *name, umode_t mode) { - size_t len = strlen(name); struct dentry *child; - inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); - child = ovl_lookup_upper(ofs, name, parent, len); - if (!IS_ERR(child) && !child->d_inode) - child = ovl_create_real(ofs, parent, child, OVL_CATTR(mode)); - inode_unlock(parent->d_inode); + child = ovl_start_creating_upper(ofs, parent, &QSTR(name)); + if (!IS_ERR(child)) { + if (!child->d_inode) + child = ovl_create_real(ofs, parent, child, + OVL_CATTR(mode)); + end_creating_keep(child); + } dput(parent); return child; @@ -1369,53 +1364,35 @@ static void ovl_set_d_op(struct super_block *sb) set_default_d_op(sb, &ovl_dentry_operations); } -int ovl_fill_super(struct super_block *sb, struct fs_context *fc) +static int ovl_fill_super_creds(struct fs_context *fc, struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; + struct cred *creator_cred = (struct cred *)ofs->creator_cred; struct ovl_fs_context *ctx = fc->fs_private; - const struct cred *old_cred = NULL; - struct dentry *root_dentry; - struct ovl_entry *oe; struct ovl_layer *layers; - struct cred *cred; + struct ovl_entry *oe = NULL; int err; - err = -EIO; - if (WARN_ON(fc->user_ns != current_user_ns())) - goto out_err; - - ovl_set_d_op(sb); - - err = -ENOMEM; - if (!ofs->creator_cred) - ofs->creator_cred = cred = prepare_creds(); - else - cred = (struct cred *)ofs->creator_cred; - if (!cred) - goto out_err; - - old_cred = ovl_override_creds(sb); - err = ovl_fs_params_verify(ctx, &ofs->config); if (err) - goto out_err; + return err; err = -EINVAL; if (ctx->nr == 0) { if (!(fc->sb_flags & SB_SILENT)) pr_err("missing 'lowerdir'\n"); - goto out_err; + return err; } err = -ENOMEM; layers = kcalloc(ctx->nr + 1, sizeof(struct ovl_layer), GFP_KERNEL); if (!layers) - goto out_err; + return err; ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL); if (!ofs->config.lowerdirs) { kfree(layers); - goto out_err; + return err; } ofs->layers = layers; /* @@ -1448,12 +1425,12 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) err = -EINVAL; if (!ofs->config.workdir) { pr_err("missing 'workdir'\n"); - goto out_err; + return err; } err = ovl_get_upper(sb, ofs, &layers[0], &ctx->upper); if (err) - goto out_err; + return err; upper_sb = ovl_upper_mnt(ofs)->mnt_sb; if (!ovl_should_sync(ofs)) { @@ -1461,13 +1438,13 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) { err = -EIO; pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n"); - goto out_err; + return err; } } err = ovl_get_workdir(sb, ofs, &ctx->upper, &ctx->work); if (err) - goto out_err; + return err; if (!ofs->workdir) sb->s_flags |= SB_RDONLY; @@ -1478,7 +1455,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) oe = ovl_get_lowerstack(sb, ctx, ofs, layers); err = PTR_ERR(oe); if (IS_ERR(oe)) - goto out_err; + return err; /* If the upper fs is nonexistent, we mark overlayfs r/o too */ if (!ovl_upper_mnt(ofs)) @@ -1531,7 +1508,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_export_op = &ovl_export_fid_operations; /* Never override disk quota limits or use reserved space */ - cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); + cap_lower(creator_cred->cap_effective, CAP_SYS_RESOURCE); sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_xattr = ovl_xattr_handlers(ofs); @@ -1549,27 +1526,44 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_iflags |= SB_I_EVM_HMAC_UNSUPPORTED; err = -ENOMEM; - root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); - if (!root_dentry) + sb->s_root = ovl_get_root(sb, ctx->upper.dentry, oe); + if (!sb->s_root) goto out_free_oe; - sb->s_root = root_dentry; - - ovl_revert_creds(old_cred); return 0; out_free_oe: ovl_free_entry(oe); + return err; +} + +int ovl_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct ovl_fs *ofs = sb->s_fs_info; + int err; + + err = -EIO; + if (WARN_ON(fc->user_ns != current_user_ns())) + goto out_err; + + ovl_set_d_op(sb); + + if (!ofs->creator_cred) { + err = -ENOMEM; + ofs->creator_cred = prepare_creds(); + if (!ofs->creator_cred) + goto out_err; + } + + with_ovl_creds(sb) + err = ovl_fill_super_creds(fc, sb); + out_err: - /* - * Revert creds before calling ovl_free_fs() which will call - * put_cred() and put_cred() requires that the cred's that are - * put are not the caller's creds, i.e., current->cred. - */ - if (old_cred) - ovl_revert_creds(old_cred); - ovl_free_fs(ofs); - sb->s_fs_info = NULL; + if (err) { + ovl_free_fs(ofs); + sb->s_fs_info = NULL; + } + return err; } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f76672f2e686..94986d11a166 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -69,11 +69,6 @@ const struct cred *ovl_override_creds(struct super_block *sb) return override_creds(ofs->creator_cred); } -void ovl_revert_creds(const struct cred *old_cred) -{ - revert_creds(old_cred); -} - /* * Check if underlying fs supports file handles and try to determine encoding * type, in order to deduce maximum inode number used by fs. @@ -1019,8 +1014,8 @@ bool ovl_inuse_trylock(struct dentry *dentry) bool locked = false; spin_lock(&inode->i_lock); - if (!(inode->i_state & I_OVL_INUSE)) { - inode->i_state |= I_OVL_INUSE; + if (!(inode_state_read(inode) & I_OVL_INUSE)) { + inode_state_set(inode, I_OVL_INUSE); locked = true; } spin_unlock(&inode->i_lock); @@ -1034,8 +1029,8 @@ void ovl_inuse_unlock(struct dentry *dentry) struct inode *inode = d_inode(dentry); spin_lock(&inode->i_lock); - WARN_ON(!(inode->i_state & I_OVL_INUSE)); - inode->i_state &= ~I_OVL_INUSE; + WARN_ON(!(inode_state_read(inode) & I_OVL_INUSE)); + inode_state_clear(inode, I_OVL_INUSE); spin_unlock(&inode->i_lock); } } @@ -1046,7 +1041,7 @@ bool ovl_is_inuse(struct dentry *dentry) bool inuse; spin_lock(&inode->i_lock); - inuse = (inode->i_state & I_OVL_INUSE); + inuse = (inode_state_read(inode) & I_OVL_INUSE); spin_unlock(&inode->i_lock); return inuse; @@ -1147,7 +1142,6 @@ fail: int ovl_nlink_start(struct dentry *dentry) { struct inode *inode = d_inode(dentry); - const struct cred *old_cred; int err; if (WARN_ON(!inode)) @@ -1184,15 +1178,14 @@ int ovl_nlink_start(struct dentry *dentry) if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode)) return 0; - old_cred = ovl_override_creds(dentry->d_sb); /* * The overlay inode nlink should be incremented/decremented IFF the * upper operation succeeds, along with nlink change of upper inode. * Therefore, before link/unlink/rename, we store the union nlink * value relative to the upper inode nlink in an upper inode xattr. */ - err = ovl_set_nlink_upper(dentry); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + err = ovl_set_nlink_upper(dentry); if (err) goto out_drop_write; @@ -1213,11 +1206,8 @@ void ovl_nlink_end(struct dentry *dentry) ovl_drop_write(dentry); if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) { - const struct cred *old_cred; - - old_cred = ovl_override_creds(dentry->d_sb); - ovl_cleanup_index(dentry); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + ovl_cleanup_index(dentry); } ovl_inode_unlock(inode); @@ -1234,9 +1224,9 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work, goto err; if (trap) goto err_unlock; - if (work && work->d_parent != workdir) + if (work && (work->d_parent != workdir || d_unhashed(work))) goto err_unlock; - if (upper && upper->d_parent != upperdir) + if (upper && (upper->d_parent != upperdir || d_unhashed(upper))) goto err_unlock; return 0; @@ -1548,14 +1538,3 @@ void ovl_copyattr(struct inode *inode) i_size_write(inode, i_size_read(realinode)); spin_unlock(&inode->i_lock); } - -int ovl_parent_lock(struct dentry *parent, struct dentry *child) -{ - inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); - if (!child || - (!d_unhashed(child) && child->d_parent == parent)) - return 0; - - inode_unlock(parent->d_inode); - return -EINVAL; -} diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c index 88055deca936..aa95855c7023 100644 --- a/fs/overlayfs/xattrs.c +++ b/fs/overlayfs/xattrs.c @@ -41,13 +41,11 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char struct dentry *upperdentry = ovl_i_dentry_upper(inode); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); struct path realpath; - const struct cred *old_cred; if (!value && !upperdentry) { ovl_path_lower(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); if (err < 0) goto out; } @@ -64,15 +62,14 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char if (err) goto out; - old_cred = ovl_override_creds(dentry->d_sb); - if (value) { - err = ovl_do_setxattr(ofs, realdentry, name, value, size, - flags); - } else { - WARN_ON(flags != XATTR_REPLACE); - err = ovl_do_removexattr(ofs, realdentry, name); + with_ovl_creds(dentry->d_sb) { + if (value) { + err = ovl_do_setxattr(ofs, realdentry, name, value, size, flags); + } else { + WARN_ON(flags != XATTR_REPLACE); + err = ovl_do_removexattr(ofs, realdentry, name); + } } - ovl_revert_creds(old_cred); ovl_drop_write(dentry); /* copy c/mtime */ @@ -84,15 +81,11 @@ out: static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { - ssize_t res; - const struct cred *old_cred; struct path realpath; ovl_i_path_real(inode, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); - ovl_revert_creds(old_cred); - return res; + with_ovl_creds(dentry->d_sb) + return vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); } static bool ovl_can_list(struct super_block *sb, const char *s) @@ -116,12 +109,10 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) ssize_t res; size_t len; char *s; - const struct cred *old_cred; size_t prefix_len, name_len; - old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_listxattr(realdentry, list, size); - ovl_revert_creds(old_cred); + with_ovl_creds(dentry->d_sb) + res = vfs_listxattr(realdentry, list, size); if (res <= 0 || size == 0) return res; diff --git a/fs/pidfs.c b/fs/pidfs.c index 0ef5b47d796a..dba703d4ce4a 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -39,20 +39,20 @@ void pidfs_get_root(struct path *path) path_get(path); } -/* - * Stashes information that userspace needs to access even after the - * process has been reaped. - */ -struct pidfs_exit_info { - __u64 cgroupid; - __s32 exit_code; - __u32 coredump_mask; +enum pidfs_attr_mask_bits { + PIDFS_ATTR_BIT_EXIT = 0, + PIDFS_ATTR_BIT_COREDUMP = 1, }; struct pidfs_attr { + unsigned long attr_mask; struct simple_xattrs *xattrs; - struct pidfs_exit_info __pei; - struct pidfs_exit_info *exit_info; + struct /* exit info */ { + __u64 cgroupid; + __s32 exit_code; + }; + __u32 coredump_mask; + __u32 coredump_signal; }; static struct rb_root pidfs_ino_tree = RB_ROOT; @@ -293,6 +293,15 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags) return 0; } +/* This must be updated whenever a new flag is added */ +#define PIDFD_INFO_SUPPORTED (PIDFD_INFO_PID | \ + PIDFD_INFO_CREDS | \ + PIDFD_INFO_CGROUPID | \ + PIDFD_INFO_EXIT | \ + PIDFD_INFO_COREDUMP | \ + PIDFD_INFO_SUPPORTED_MASK | \ + PIDFD_INFO_COREDUMP_SIGNAL) + static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; @@ -300,12 +309,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; - struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; struct pidfs_attr *attr; const struct cred *c; __u64 mask; + BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2); + if (!uinfo) return -EINVAL; if (usize < PIDFD_INFO_SIZE_VER0) @@ -323,20 +333,24 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) attr = READ_ONCE(pid->attr); if (mask & PIDFD_INFO_EXIT) { - exit_info = READ_ONCE(attr->exit_info); - if (exit_info) { + if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) { + smp_rmb(); kinfo.mask |= PIDFD_INFO_EXIT; #ifdef CONFIG_CGROUPS - kinfo.cgroupid = exit_info->cgroupid; + kinfo.cgroupid = attr->cgroupid; kinfo.mask |= PIDFD_INFO_CGROUPID; #endif - kinfo.exit_code = exit_info->exit_code; + kinfo.exit_code = attr->exit_code; } } if (mask & PIDFD_INFO_COREDUMP) { - kinfo.mask |= PIDFD_INFO_COREDUMP; - kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask); + if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) { + smp_rmb(); + kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL; + kinfo.coredump_mask = attr->coredump_mask; + kinfo.coredump_signal = attr->coredump_signal; + } } task = get_pid_task(pid, PIDTYPE_PID); @@ -355,14 +369,15 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if (!c) return -ESRCH; - if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) { - task_lock(task); + if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) { + guard(task_lock)(task); if (task->mm) { unsigned long flags = __mm_flags_get_dumpable(task->mm); kinfo.coredump_mask = pidfs_coredump_mask(flags); + kinfo.mask |= PIDFD_INFO_COREDUMP; + /* No coredump actually took place, so no coredump signal. */ } - task_unlock(task); } /* Unconditionally return identifiers and credentials, the rest only on request */ @@ -409,6 +424,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) return -ESRCH; copy_out: + if (mask & PIDFD_INFO_SUPPORTED_MASK) { + kinfo.mask |= PIDFD_INFO_SUPPORTED_MASK; + kinfo.supported_mask = PIDFD_INFO_SUPPORTED; + } + + /* Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? */ + WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask); /* * If userspace and the kernel have the same struct size it can just * be copied. If userspace provides an older struct, only the bits that @@ -454,7 +476,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct task_struct *task __free(put_task) = NULL; struct nsproxy *nsp __free(put_nsproxy) = NULL; struct ns_common *ns_common = NULL; - struct pid_namespace *pid_ns; if (!pidfs_ioctl_valid(cmd)) return -ENOIOCTLCMD; @@ -496,66 +517,64 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { /* Namespaces that hang of nsproxy. */ case PIDFD_GET_CGROUP_NAMESPACE: - if (IS_ENABLED(CONFIG_CGROUPS)) { - get_cgroup_ns(nsp->cgroup_ns); - ns_common = to_ns_common(nsp->cgroup_ns); - } + if (!ns_ref_get(nsp->cgroup_ns)) + break; + ns_common = to_ns_common(nsp->cgroup_ns); break; case PIDFD_GET_IPC_NAMESPACE: - if (IS_ENABLED(CONFIG_IPC_NS)) { - get_ipc_ns(nsp->ipc_ns); - ns_common = to_ns_common(nsp->ipc_ns); - } + if (!ns_ref_get(nsp->ipc_ns)) + break; + ns_common = to_ns_common(nsp->ipc_ns); break; case PIDFD_GET_MNT_NAMESPACE: - get_mnt_ns(nsp->mnt_ns); + if (!ns_ref_get(nsp->mnt_ns)) + break; ns_common = to_ns_common(nsp->mnt_ns); break; case PIDFD_GET_NET_NAMESPACE: - if (IS_ENABLED(CONFIG_NET_NS)) { - ns_common = to_ns_common(nsp->net_ns); - get_net_ns(ns_common); - } + if (!ns_ref_get(nsp->net_ns)) + break; + ns_common = to_ns_common(nsp->net_ns); break; case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: - if (IS_ENABLED(CONFIG_PID_NS)) { - get_pid_ns(nsp->pid_ns_for_children); - ns_common = to_ns_common(nsp->pid_ns_for_children); - } + if (!ns_ref_get(nsp->pid_ns_for_children)) + break; + ns_common = to_ns_common(nsp->pid_ns_for_children); break; case PIDFD_GET_TIME_NAMESPACE: - if (IS_ENABLED(CONFIG_TIME_NS)) { - get_time_ns(nsp->time_ns); - ns_common = to_ns_common(nsp->time_ns); - } + if (!ns_ref_get(nsp->time_ns)) + break; + ns_common = to_ns_common(nsp->time_ns); break; case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: - if (IS_ENABLED(CONFIG_TIME_NS)) { - get_time_ns(nsp->time_ns_for_children); - ns_common = to_ns_common(nsp->time_ns_for_children); - } + if (!ns_ref_get(nsp->time_ns_for_children)) + break; + ns_common = to_ns_common(nsp->time_ns_for_children); break; case PIDFD_GET_UTS_NAMESPACE: - if (IS_ENABLED(CONFIG_UTS_NS)) { - get_uts_ns(nsp->uts_ns); - ns_common = to_ns_common(nsp->uts_ns); - } + if (!ns_ref_get(nsp->uts_ns)) + break; + ns_common = to_ns_common(nsp->uts_ns); break; /* Namespaces that don't hang of nsproxy. */ case PIDFD_GET_USER_NAMESPACE: - if (IS_ENABLED(CONFIG_USER_NS)) { - rcu_read_lock(); - ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); - rcu_read_unlock(); + scoped_guard(rcu) { + struct user_namespace *user_ns; + + user_ns = task_cred_xxx(task, user_ns); + if (!ns_ref_get(user_ns)) + break; + ns_common = to_ns_common(user_ns); } break; case PIDFD_GET_PID_NAMESPACE: - if (IS_ENABLED(CONFIG_PID_NS)) { - rcu_read_lock(); + scoped_guard(rcu) { + struct pid_namespace *pid_ns; + pid_ns = task_active_pid_ns(task); - if (pid_ns) - ns_common = to_ns_common(get_pid_ns(pid_ns)); - rcu_read_unlock(); + if (!ns_ref_get(pid_ns)) + break; + ns_common = to_ns_common(pid_ns); } break; default: @@ -606,24 +625,25 @@ void pidfs_exit(struct task_struct *tsk) { struct pid *pid = task_pid(tsk); struct pidfs_attr *attr; - struct pidfs_exit_info *exit_info; #ifdef CONFIG_CGROUPS struct cgroup *cgrp; #endif might_sleep(); - guard(spinlock_irq)(&pid->wait_pidfd.lock); - attr = pid->attr; - if (!attr) { - /* - * No one ever held a pidfd for this struct pid. - * Mark it as dead so no one can add a pidfs - * entry anymore. We're about to be reaped and - * so no exit information would be available. - */ - pid->attr = PIDFS_PID_DEAD; - return; + /* Synchronize with pidfs_register_pid(). */ + scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) { + attr = pid->attr; + if (!attr) { + /* + * No one ever held a pidfd for this struct pid. + * Mark it as dead so no one can add a pidfs + * entry anymore. We're about to be reaped and + * so no exit information would be available. + */ + pid->attr = PIDFS_PID_DEAD; + return; + } } /* @@ -634,41 +654,39 @@ void pidfs_exit(struct task_struct *tsk) * is put */ - exit_info = &attr->__pei; - #ifdef CONFIG_CGROUPS rcu_read_lock(); cgrp = task_dfl_cgroup(tsk); - exit_info->cgroupid = cgroup_id(cgrp); + attr->cgroupid = cgroup_id(cgrp); rcu_read_unlock(); #endif - exit_info->exit_code = tsk->exit_code; + attr->exit_code = tsk->exit_code; /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ - smp_store_release(&attr->exit_info, &attr->__pei); + smp_wmb(); + set_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask); } #ifdef CONFIG_COREDUMP void pidfs_coredump(const struct coredump_params *cprm) { struct pid *pid = cprm->pid; - struct pidfs_exit_info *exit_info; struct pidfs_attr *attr; - __u32 coredump_mask = 0; attr = READ_ONCE(pid->attr); VFS_WARN_ON_ONCE(!attr); VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD); - exit_info = &attr->__pei; - /* Note how we were coredumped. */ - coredump_mask = pidfs_coredump_mask(cprm->mm_flags); - /* Note that we actually did coredump. */ - coredump_mask |= PIDFD_COREDUMPED; + /* Note how we were coredumped and that we coredumped. */ + attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) | + PIDFD_COREDUMPED; /* If coredumping is set to skip we should never end up here. */ - VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP); - smp_store_release(&exit_info->coredump_mask, coredump_mask); + VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP); + /* Expose the signal number that caused the coredump. */ + attr->coredump_signal = cprm->siginfo->si_signo; + smp_wmb(); + set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask); } #endif @@ -1022,6 +1040,7 @@ static int pidfs_init_fs_context(struct fs_context *fc) fc->s_iflags |= SB_I_NOEXEC; fc->s_iflags |= SB_I_NODEV; + ctx->s_d_flags |= DCACHE_DONTCACHE; ctx->ops = &pidfs_sops; ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; diff --git a/fs/pipe.c b/fs/pipe.c index 42fead1efe52..2d0fed2ecbfd 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -908,7 +908,7 @@ static struct inode * get_pipe_inode(void) * list because "mark_inode_dirty()" will think * that it already _is_ on the dirty list. */ - inode->i_state = I_DIRTY; + inode_state_assign_raw(inode, I_DIRTY); inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 4050942ab52f..768f027c1428 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1091,7 +1091,7 @@ int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, int acl_type; int error; struct inode *inode = d_inode(dentry); - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; acl_type = posix_acl_type(acl_name); if (acl_type < 0) @@ -1141,7 +1141,7 @@ retry_deleg: out_inode_unlock: inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -1212,7 +1212,7 @@ int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, int acl_type; int error; struct inode *inode = d_inode(dentry); - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; acl_type = posix_acl_type(acl_name); if (acl_type < 0) @@ -1249,7 +1249,7 @@ retry_deleg: out_inode_unlock: inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/fs/proc/array.c b/fs/proc/array.c index 2ae63189091e..cbd4bc4a58e4 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -481,7 +481,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unsigned long flags; int exit_code = task->exit_code; struct signal_struct *sig = task->signal; - unsigned int seq = 1; state = *get_task_state(task); vsize = eip = esp = 0; @@ -538,10 +537,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, if (permitted && (!whole || num_threads < 2)) wchan = !task_is_running(task); - do { - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); - + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { cmin_flt = sig->cmin_flt; cmaj_flt = sig->cmaj_flt; cutime = sig->cutime; @@ -563,8 +559,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, } rcu_read_unlock(); } - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + } if (whole) { thread_group_cputime_adjusted(task, &utime, &stime); diff --git a/fs/proc/base.c b/fs/proc/base.c index 6299878e3d97..407b41cb6e7c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3043,21 +3043,14 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh if (whole) { struct signal_struct *sig = task->signal; struct task_struct *t; - unsigned int seq = 1; - unsigned long flags; - - rcu_read_lock(); - do { - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + guard(rcu)(); + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { acct = sig->ioac; __for_each_thread(sig, t) task_io_accounting_add(&acct, &t->ioac); - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - rcu_read_unlock(); + } } else { acct = task->ioac; } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 176281112273..501889856461 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -698,6 +698,12 @@ void pde_put(struct proc_dir_entry *pde) } } +static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent) +{ + rb_erase(&pde->subdir_node, &parent->subdir); + RB_CLEAR_NODE(&pde->subdir_node); +} + /* * Remove a /proc entry and free it if it's not currently in use. */ @@ -720,7 +726,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) WARN(1, "removing permanent /proc entry '%s'", de->name); de = NULL; } else { - rb_erase(&de->subdir_node, &parent->subdir); + pde_erase(de, parent); if (S_ISDIR(de->mode)) parent->nlink--; } @@ -764,7 +770,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) root->parent->name, root->name); return -EINVAL; } - rb_erase(&root->subdir_node, &parent->subdir); + pde_erase(root, parent); de = root; while (1) { @@ -776,7 +782,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) next->parent->name, next->name); return -EINVAL; } - rb_erase(&next->subdir_node, &de->subdir); + pde_erase(next, de); de = next; continue; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index e399e2dd3a12..31d78da203ea 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -290,7 +290,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; qnx4_inode = qnx4_raw_inode(inode); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 3310d1ad4d0e..88d285005083 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -521,7 +521,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ei = QNX6_I(inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6c4a6ee1fa2b..376739f6420e 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1033,7 +1033,7 @@ static int add_dquot_ref(struct super_block *sb, int type) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) || !atomic_read(&inode->i_writecount) || !dqinit_needed(inode, type)) { spin_unlock(&inode->i_lock); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 0d0ef54fc4de..b2d178d3556e 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -24,7 +24,8 @@ #include "internal.h" struct rdt_parse_data { - struct rdtgroup *rdtgrp; + u32 closid; + enum rdtgrp_mode mode; char *buf; }; @@ -77,8 +78,8 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_ctrl_domain *d) { struct resctrl_staged_config *cfg; - u32 closid = data->rdtgrp->closid; struct rdt_resource *r = s->res; + u32 closid = data->closid; u32 bw_val; cfg = &d->staged_config[s->conf_type]; @@ -156,9 +157,10 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_ctrl_domain *d) { - struct rdtgroup *rdtgrp = data->rdtgrp; + enum rdtgrp_mode mode = data->mode; struct resctrl_staged_config *cfg; struct rdt_resource *r = s->res; + u32 closid = data->closid; u32 cbm_val; cfg = &d->staged_config[s->conf_type]; @@ -171,7 +173,7 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, * Cannot set up more than one pseudo-locked region in a cache * hierarchy. */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + if (mode == RDT_MODE_PSEUDO_LOCKSETUP && rdtgroup_pseudo_locked_in_hierarchy(d)) { rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); return -EINVAL; @@ -180,8 +182,7 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, if (!cbm_validate(data->buf, &cbm_val, r)) return -EINVAL; - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_SHAREABLE) && + if ((mode == RDT_MODE_EXCLUSIVE || mode == RDT_MODE_SHAREABLE) && rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); return -EINVAL; @@ -191,14 +192,14 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, * The CBM may not overlap with the CBM of another closid if * either is exclusive. */ - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, closid, true)) { rdt_last_cmd_puts("Overlaps with exclusive group\n"); return -EINVAL; } - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, closid, false)) { + if (mode == RDT_MODE_EXCLUSIVE || + mode == RDT_MODE_PSEUDO_LOCKSETUP) { rdt_last_cmd_puts("Overlaps with other group\n"); return -EINVAL; } @@ -262,7 +263,8 @@ next: list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (d->hdr.id == dom_id) { data.buf = dom; - data.rdtgrp = rdtgrp; + data.closid = rdtgrp->closid; + data.mode = rdtgrp->mode; if (parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { @@ -381,7 +383,8 @@ out: return ret ?: nbytes; } -static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) +static void show_doms(struct seq_file *s, struct resctrl_schema *schema, + char *resource_name, int closid) { struct rdt_resource *r = schema->res; struct rdt_ctrl_domain *dom; @@ -391,7 +394,8 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - seq_printf(s, "%*s:", max_name_width, schema->name); + if (resource_name) + seq_printf(s, "%*s:", max_name_width, resource_name); list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { if (sep) seq_puts(s, ";"); @@ -437,7 +441,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, closid = rdtgrp->closid; list_for_each_entry(schema, &resctrl_schema_all, list) { if (closid < schema->num_closid) - show_doms(s, schema, closid); + show_doms(s, schema, schema->name, closid); } } } else { @@ -676,3 +680,280 @@ out: rdtgroup_kn_unlock(of->kn); return ret; } + +int resctrl_io_alloc_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + mutex_lock(&rdtgroup_mutex); + + if (r->cache.io_alloc_capable) { + if (resctrl_arch_get_io_alloc_enabled(r)) + seq_puts(seq, "enabled\n"); + else + seq_puts(seq, "disabled\n"); + } else { + seq_puts(seq, "not supported\n"); + } + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +/* + * resctrl_io_alloc_closid_supported() - io_alloc feature utilizes the + * highest CLOSID value to direct I/O traffic. Ensure that io_alloc_closid + * is in the supported range. + */ +static bool resctrl_io_alloc_closid_supported(u32 io_alloc_closid) +{ + return io_alloc_closid < closids_supported(); +} + +/* + * Initialize io_alloc CLOSID cache resource CBM with all usable (shared + * and unused) cache portions. + */ +static int resctrl_io_alloc_init_cbm(struct resctrl_schema *s, u32 closid) +{ + enum resctrl_conf_type peer_type; + struct rdt_resource *r = s->res; + struct rdt_ctrl_domain *d; + int ret; + + rdt_staged_configs_clear(); + + ret = rdtgroup_init_cat(s, closid); + if (ret < 0) + goto out; + + /* Keep CDP_CODE and CDP_DATA of io_alloc CLOSID's CBM in sync. */ + if (resctrl_arch_get_cdp_enabled(r->rid)) { + peer_type = resctrl_peer_type(s->conf_type); + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) + memcpy(&d->staged_config[peer_type], + &d->staged_config[s->conf_type], + sizeof(d->staged_config[0])); + } + + ret = resctrl_arch_update_domains(r, closid); +out: + rdt_staged_configs_clear(); + return ret; +} + +/* + * resctrl_io_alloc_closid() - io_alloc feature routes I/O traffic using + * the highest available CLOSID. Retrieve the maximum CLOSID supported by the + * resource. Note that if Code Data Prioritization (CDP) is enabled, the number + * of available CLOSIDs is reduced by half. + */ +u32 resctrl_io_alloc_closid(struct rdt_resource *r) +{ + if (resctrl_arch_get_cdp_enabled(r->rid)) + return resctrl_arch_get_num_closid(r) / 2 - 1; + else + return resctrl_arch_get_num_closid(r) - 1; +} + +ssize_t resctrl_io_alloc_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + char const *grp_name; + u32 io_alloc_closid; + bool enable; + int ret; + + ret = kstrtobool(buf, &enable); + if (ret) + return ret; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!r->cache.io_alloc_capable) { + rdt_last_cmd_printf("io_alloc is not supported on %s\n", s->name); + ret = -ENODEV; + goto out_unlock; + } + + /* If the feature is already up to date, no action is needed. */ + if (resctrl_arch_get_io_alloc_enabled(r) == enable) + goto out_unlock; + + io_alloc_closid = resctrl_io_alloc_closid(r); + if (!resctrl_io_alloc_closid_supported(io_alloc_closid)) { + rdt_last_cmd_printf("io_alloc CLOSID (ctrl_hw_id) %u is not available\n", + io_alloc_closid); + ret = -EINVAL; + goto out_unlock; + } + + if (enable) { + if (!closid_alloc_fixed(io_alloc_closid)) { + grp_name = rdtgroup_name_by_closid(io_alloc_closid); + WARN_ON_ONCE(!grp_name); + rdt_last_cmd_printf("CLOSID (ctrl_hw_id) %u for io_alloc is used by %s group\n", + io_alloc_closid, grp_name ? grp_name : "another"); + ret = -ENOSPC; + goto out_unlock; + } + + ret = resctrl_io_alloc_init_cbm(s, io_alloc_closid); + if (ret) { + rdt_last_cmd_puts("Failed to initialize io_alloc allocations\n"); + closid_free(io_alloc_closid); + goto out_unlock; + } + } else { + closid_free(io_alloc_closid); + } + + ret = resctrl_arch_io_alloc_enable(r, enable); + if (enable && ret) { + rdt_last_cmd_puts("Failed to enable io_alloc feature\n"); + closid_free(io_alloc_closid); + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +int resctrl_io_alloc_cbm_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!r->cache.io_alloc_capable) { + rdt_last_cmd_printf("io_alloc is not supported on %s\n", s->name); + ret = -ENODEV; + goto out_unlock; + } + + if (!resctrl_arch_get_io_alloc_enabled(r)) { + rdt_last_cmd_printf("io_alloc is not enabled on %s\n", s->name); + ret = -EINVAL; + goto out_unlock; + } + + /* + * When CDP is enabled, the CBMs of the highest CLOSID of CDP_CODE and + * CDP_DATA are kept in sync. As a result, the io_alloc CBMs shown for + * either CDP resource are identical and accurately represent the CBMs + * used for I/O. + */ + show_doms(seq, s, NULL, resctrl_io_alloc_closid(r)); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +static int resctrl_io_alloc_parse_line(char *line, struct rdt_resource *r, + struct resctrl_schema *s, u32 closid) +{ + enum resctrl_conf_type peer_type; + struct rdt_parse_data data; + struct rdt_ctrl_domain *d; + char *dom = NULL, *id; + unsigned long dom_id; + +next: + if (!line || line[0] == '\0') + return 0; + + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); + return -EINVAL; + } + + dom = strim(dom); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (d->hdr.id == dom_id) { + data.buf = dom; + data.mode = RDT_MODE_SHAREABLE; + data.closid = closid; + if (parse_cbm(&data, s, d)) + return -EINVAL; + /* + * Keep io_alloc CLOSID's CBM of CDP_CODE and CDP_DATA + * in sync. + */ + if (resctrl_arch_get_cdp_enabled(r->rid)) { + peer_type = resctrl_peer_type(s->conf_type); + memcpy(&d->staged_config[peer_type], + &d->staged_config[s->conf_type], + sizeof(d->staged_config[0])); + } + goto next; + } + } + + return -EINVAL; +} + +ssize_t resctrl_io_alloc_cbm_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + u32 io_alloc_closid; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!r->cache.io_alloc_capable) { + rdt_last_cmd_printf("io_alloc is not supported on %s\n", s->name); + ret = -ENODEV; + goto out_unlock; + } + + if (!resctrl_arch_get_io_alloc_enabled(r)) { + rdt_last_cmd_printf("io_alloc is not enabled on %s\n", s->name); + ret = -EINVAL; + goto out_unlock; + } + + io_alloc_closid = resctrl_io_alloc_closid(r); + + rdt_staged_configs_clear(); + ret = resctrl_io_alloc_parse_line(buf, r, s, io_alloc_closid); + if (ret) + goto out_clear_configs; + + ret = resctrl_arch_update_domains(r, io_alloc_closid); + +out_clear_configs: + rdt_staged_configs_clear(); +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index cf1fd82dc5a9..bff4a54ae333 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -390,6 +390,8 @@ void rdt_staged_configs_clear(void); bool closid_allocated(unsigned int closid); +bool closid_alloc_fixed(u32 closid); + int resctrl_find_cleanest_closid(void); void *rdt_kn_parent_priv(struct kernfs_node *kn); @@ -426,6 +428,21 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, voi ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int resctrl_io_alloc_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + +int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid); + +enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type); + +ssize_t resctrl_io_alloc_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +const char *rdtgroup_name_by_closid(u32 closid); +int resctrl_io_alloc_cbm_show(struct kernfs_open_file *of, struct seq_file *seq, + void *v); +ssize_t resctrl_io_alloc_cbm_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); +u32 resctrl_io_alloc_closid(struct rdt_resource *r); #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 0320360cd7a6..8e39dfda56bc 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -226,6 +226,11 @@ bool closid_allocated(unsigned int closid) return !test_bit(closid, closid_free_map); } +bool closid_alloc_fixed(u32 closid) +{ + return __test_and_clear_bit(closid, closid_free_map); +} + /** * rdtgroup_mode_by_closid - Return mode of resource group with closid * @closid: closid if the resource group @@ -1057,15 +1062,17 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { if (sep) seq_putc(seq, ';'); + hw_shareable = r->cache.shareable_bits; sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->hdr.id); for (i = 0; i < closids_supported(); i++) { - if (!closid_allocated(i)) + if (!closid_allocated(i) || + (resctrl_arch_get_io_alloc_enabled(r) && + i == resctrl_io_alloc_closid(r))) continue; ctrl_val = resctrl_arch_get_config(r, dom, i, s->conf_type); @@ -1093,6 +1100,21 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, break; } } + + /* + * When the "io_alloc" feature is enabled, a portion of the cache + * is configured for shared use between hardware and software. + * Also, when CDP is enabled the CBMs of CDP_CODE and CDP_DATA + * resources are kept in sync. So, the CBMs for "io_alloc" can + * be accessed through either resource. + */ + if (resctrl_arch_get_io_alloc_enabled(r)) { + ctrl_val = resctrl_arch_get_config(r, dom, + resctrl_io_alloc_closid(r), + s->conf_type); + hw_shareable |= ctrl_val; + } + for (i = r->cache.cbm_len - 1; i >= 0; i--) { pseudo_locked = dom->plr ? dom->plr->cbm : 0; hwb = test_bit(i, &hw_shareable); @@ -1247,7 +1269,7 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, return 0; } -static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) +enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) { switch (my_type) { case CDP_CODE: @@ -1838,6 +1860,18 @@ void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_ kernfs_put(mon_kn); } +const char *rdtgroup_name_by_closid(u32 closid) +{ + struct rdtgroup *rdtgrp; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->closid == closid) + return rdt_kn_name(rdtgrp->kn); + } + + return NULL; +} + /* rdtgroup information files for one cache resource. */ static struct rftype res_common_files[] = { { @@ -1948,6 +1982,20 @@ static struct rftype res_common_files[] = { .seq_show = rdt_thread_throttle_mode_show, }, { + .name = "io_alloc", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_io_alloc_show, + .write = resctrl_io_alloc_write, + }, + { + .name = "io_alloc_cbm", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_io_alloc_cbm_show, + .write = resctrl_io_alloc_cbm_write, + }, + { .name = "max_threshold_occupancy", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -2138,6 +2186,23 @@ static void thread_throttle_mode_init(void) RFTYPE_CTRL_INFO | RFTYPE_RES_MB); } +/* + * The resctrl file "io_alloc" is added using L3 resource. However, it results + * in this file being visible for *all* cache resources (eg. L2 cache), + * whether it supports "io_alloc" or not. + */ +static void io_alloc_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (r->cache.io_alloc_capable) { + resctrl_file_fflags_init("io_alloc", RFTYPE_CTRL_INFO | + RFTYPE_RES_CACHE); + resctrl_file_fflags_init("io_alloc_cbm", + RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE); + } +} + void resctrl_file_fflags_init(const char *config, unsigned long fflags) { struct rftype *rft; @@ -3383,11 +3448,12 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) { unsigned int cbm_len = r->cache.cbm_len; unsigned long first_bit, zero_bit; - unsigned long val = _val; + unsigned long val; - if (!val) - return 0; + if (!_val || r->cache.arch_has_sparse_bitmasks) + return _val; + val = _val; first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); @@ -3480,7 +3546,7 @@ static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schem * If there are no more shareable bits available on any domain then * the entire allocation will fail. */ -static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) +int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) { struct rdt_ctrl_domain *d; int ret; @@ -4408,6 +4474,8 @@ int resctrl_init(void) thread_throttle_mode_init(); + io_alloc_init(); + ret = resctrl_mon_resource_init(); if (ret) return ret; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 0addcc849ff2..360b00854115 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -302,7 +302,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) if (!i) return ERR_PTR(-ENOMEM); - if (!(i->i_state & I_NEW)) + if (!(inode_state_read_once(i) & I_NEW)) return i; /* precalculate the data offset */ diff --git a/fs/select.c b/fs/select.c index 082cf60c7e23..65019b8ba3f7 100644 --- a/fs/select.c +++ b/fs/select.c @@ -776,17 +776,13 @@ static inline int get_sigset_argpack(struct sigset_argpack *to, { // the path is hot enough for overhead of copy_from_user() to matter if (from) { - if (can_do_masked_user_access()) - from = masked_user_access_begin(from); - else if (!user_read_access_begin(from, sizeof(*from))) - return -EFAULT; - unsafe_get_user(to->p, &from->p, Efault); - unsafe_get_user(to->size, &from->size, Efault); - user_read_access_end(); + scoped_user_read_access(from, Efault) { + unsafe_get_user(to->p, &from->p, Efault); + unsafe_get_user(to->size, &from->size, Efault); + } } return 0; Efault: - user_read_access_end(); return -EFAULT; } diff --git a/fs/signalfd.c b/fs/signalfd.c index d469782f97f4..d69eab584bc6 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -250,8 +250,6 @@ static const struct file_operations signalfd_fops = { static int do_signalfd4(int ufd, sigset_t *mask, int flags) { - struct signalfd_ctx *ctx; - /* Check the SFD_* constants for consistency. */ BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK); @@ -263,7 +261,8 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) signotset(mask); if (ufd == -1) { - struct file *file; + int fd; + struct signalfd_ctx *ctx __free(kfree) = NULL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -271,22 +270,16 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) ctx->sigmask = *mask; - ufd = get_unused_fd_flags(flags & O_CLOEXEC); - if (ufd < 0) { - kfree(ctx); - return ufd; - } - - file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops, - ctx, O_RDWR | (flags & O_NONBLOCK), - FMODE_NOWAIT); - if (IS_ERR(file)) { - put_unused_fd(ufd); - kfree(ctx); - return PTR_ERR(file); - } - fd_install(ufd, file); + fd = FD_ADD(flags & O_CLOEXEC, + anon_inode_getfile_fmode( + "[signalfd]", &signalfd_fops, ctx, + O_RDWR | (flags & O_NONBLOCK), FMODE_NOWAIT)); + if (fd >= 0) + retain_and_null_ptr(ctx); + return fd; } else { + struct signalfd_ctx *ctx; + CLASS(fd, f)(ufd); if (fd_empty(f)) return -EBADF; diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index b8ac7b7faf61..e3ea6fe7edb4 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -16,6 +16,7 @@ static struct cached_fid *init_cached_dir(const char *path); static void free_cached_dir(struct cached_fid *cfid); static void smb2_close_cached_fid(struct kref *ref); static void cfids_laundromat_worker(struct work_struct *work); +static void close_cached_dir_locked(struct cached_fid *cfid); struct cached_dir_dentry { struct list_head entry; @@ -388,11 +389,11 @@ out: * lease. Release one here, and the second below. */ cfid->has_lease = false; - kref_put(&cfid->refcount, smb2_close_cached_fid); + close_cached_dir_locked(cfid); } spin_unlock(&cfids->cfid_list_lock); - kref_put(&cfid->refcount, smb2_close_cached_fid); + close_cached_dir(cfid); } else { *ret_cfid = cfid; atomic_inc(&tcon->num_remote_opens); @@ -438,12 +439,14 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, static void smb2_close_cached_fid(struct kref *ref) +__releases(&cfid->cfids->cfid_list_lock) { struct cached_fid *cfid = container_of(ref, struct cached_fid, refcount); int rc; - spin_lock(&cfid->cfids->cfid_list_lock); + lockdep_assert_held(&cfid->cfids->cfid_list_lock); + if (cfid->on_list) { list_del(&cfid->entry); cfid->on_list = false; @@ -478,15 +481,49 @@ void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon, spin_lock(&cfid->cfids->cfid_list_lock); if (cfid->has_lease) { cfid->has_lease = false; - kref_put(&cfid->refcount, smb2_close_cached_fid); + close_cached_dir_locked(cfid); } spin_unlock(&cfid->cfids->cfid_list_lock); close_cached_dir(cfid); } - +/** + * close_cached_dir - drop a reference of a cached dir + * + * The release function will be called with cfid_list_lock held to remove the + * cached dirs from the list before any other thread can take another @cfid + * ref. Must not be called with cfid_list_lock held; use + * close_cached_dir_locked() called instead. + * + * @cfid: cached dir + */ void close_cached_dir(struct cached_fid *cfid) { + lockdep_assert_not_held(&cfid->cfids->cfid_list_lock); + kref_put_lock(&cfid->refcount, smb2_close_cached_fid, &cfid->cfids->cfid_list_lock); +} + +/** + * close_cached_dir_locked - put a reference of a cached dir with + * cfid_list_lock held + * + * Calling close_cached_dir() with cfid_list_lock held has the potential effect + * of causing a deadlock if the invariant of refcount >= 2 is false. + * + * This function is used in paths that hold cfid_list_lock and expect at least + * two references. If that invariant is violated, WARNs and returns without + * dropping a reference; the final put must still go through + * close_cached_dir(). + * + * @cfid: cached dir + */ +static void close_cached_dir_locked(struct cached_fid *cfid) +{ + lockdep_assert_held(&cfid->cfids->cfid_list_lock); + + if (WARN_ON(kref_read(&cfid->refcount) < 2)) + return; + kref_put(&cfid->refcount, smb2_close_cached_fid); } @@ -596,7 +633,7 @@ cached_dir_offload_close(struct work_struct *work) WARN_ON(cfid->on_list); - kref_put(&cfid->refcount, smb2_close_cached_fid); + close_cached_dir(cfid); cifs_put_tcon(tcon, netfs_trace_tcon_ref_put_cached_close); } @@ -762,7 +799,7 @@ static void cfids_laundromat_worker(struct work_struct *work) * Drop the ref-count from above, either the lease-ref (if there * was one) or the extra one acquired. */ - kref_put(&cfid->refcount, smb2_close_cached_fid); + close_cached_dir(cfid); } queue_delayed_work(cfid_put_wq, &cfids->laundromat_work, dir_cache_timeout * HZ); diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 9891f55bac1e..da935bd1ce87 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -90,7 +90,6 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, size_t desc_len; struct key *spnego_key; const char *hostname = server->hostname; - const struct cred *saved_cred; /* length of fields (with semicolons): ver=0xyz ip4=ipaddress host=hostname sec=mechanism uid=0xFF user=username */ @@ -158,9 +157,8 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, dp += sprintf(dp, ";upcall_target=app"); cifs_dbg(FYI, "key description = %s\n", description); - saved_cred = override_creds(spnego_cred); - spnego_key = request_key(&cifs_spnego_key_type, description, ""); - revert_creds(saved_cred); + scoped_with_creds(spnego_cred) + spnego_key = request_key(&cifs_spnego_key_type, description, ""); #ifdef CONFIG_CIFS_DEBUG2 if (cifsFYI && !IS_ERR(spnego_key)) { diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 185ac41bd7e9..6eccb9ed9daa 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -500,7 +500,7 @@ cifs_evict_inode(struct inode *inode) { netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); - if (inode->i_state & I_PINNING_NETFS_WB) + if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); cifs_fscache_release_inode_cookie(inode); clear_inode(inode); @@ -1149,6 +1149,9 @@ cifs_setlease(struct file *file, int arg, struct file_lease **lease, void **priv struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + /* Check if file is oplocked if this is request for new lease */ if (arg == F_UNLCK || ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 7da194f29fef..dcc50a2bfa4b 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1363,6 +1363,14 @@ do_retry: if (rdata->result == -ENODATA) { rdata->result = 0; __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + trace_smb3_read_err(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->subreq.len - rdata->subreq.transferred, + rdata->result); } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && @@ -1374,6 +1382,13 @@ do_retry: } if (rdata->got_bytes) __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); + trace_smb3_read_done(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->got_bytes); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, @@ -1445,6 +1460,13 @@ cifs_async_readv(struct cifs_io_subrequest *rdata) rdata->iov[1].iov_base = (char *)smb + 4; rdata->iov[1].iov_len = get_rfc1002_length(smb); + trace_smb3_read_enter(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->xid, + rdata->req->cfile->fid.netfid, + tcon->tid, tcon->ses->Suid, + rdata->subreq.start, rdata->subreq.len); + rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive, cifs_readv_callback, NULL, rdata, 0, NULL); diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 55cb4b0cbd48..2f94d93b95e9 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -4451,6 +4451,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) out: kfree(ctx->username); + kfree(ctx->domainname); kfree_sensitive(ctx->password); kfree(origin_fullpath); kfree(ctx); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 474dadeb1593..9dc0a968ec89 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -9,6 +9,7 @@ * */ #include <linux/fs.h> +#include <linux/fs_struct.h> #include <linux/filelock.h> #include <linux/backing-dev.h> #include <linux/stat.h> diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index e60927b2a7c8..2a0d8b87bd8e 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1435,12 +1435,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_errorf(fc, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } + kfree(ctx->source); ctx->source = smb3_fs_context_fullpath(ctx, '/'); if (IS_ERR(ctx->source)) { ctx->source = NULL; cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } + kfree(fc->source); fc->source = kstrdup(ctx->source, GFP_KERNEL); if (fc->source == NULL) { cifs_errorf(fc, "OOM when copying UNC string\n"); @@ -1468,7 +1470,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; } - if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) > + if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) == CIFS_MAX_USERNAME_LEN) { pr_warn("username too long\n"); goto cifs_parse_mount_err; @@ -1832,6 +1834,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->password = NULL; kfree_sensitive(ctx->password2); ctx->password2 = NULL; + kfree(ctx->source); + ctx->source = NULL; + kfree(fc->source); + fc->source = NULL; return -EINVAL; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index cac355364e43..b75482730912 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -6,6 +6,7 @@ * */ #include <linux/fs.h> +#include <linux/fs_struct.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/pagemap.h> @@ -101,7 +102,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) cifs_dbg(FYI, "%s: revalidating inode %llu\n", __func__, cifs_i->uniqueid); - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { cifs_dbg(FYI, "%s: inode %llu is new\n", __func__, cifs_i->uniqueid); return; @@ -146,7 +147,7 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) */ if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) { /* only provide fake values on a new inode */ - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { if (fattr->cf_cifsattrs & ATTR_DIRECTORY) set_nlink(inode, 2); else @@ -167,12 +168,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - if (!(inode->i_state & I_NEW) && + if (!(inode_state_read_once(inode) & I_NEW) && unlikely(inode_wrong_type(inode, fattr->cf_mode))) { CIFS_I(inode)->time = 0; /* force reval */ return -ESTALE; } - if (inode->i_state & I_NEW) + if (inode_state_read_once(inode) & I_NEW) CIFS_I(inode)->netfs.zero_point = fattr->cf_eof; cifs_revalidate_cache(inode, fattr); @@ -194,7 +195,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, inode->i_gid = fattr->cf_gid; /* if dynperm is set, don't clobber existing mode */ - if (inode->i_state & I_NEW || + if (inode_state_read(inode) & I_NEW || !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) inode->i_mode = fattr->cf_mode; @@ -236,7 +237,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr, if (fattr->cf_flags & CIFS_FATTR_JUNCTION) inode->i_flags |= S_AUTOMOUNT; - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { cifs_set_netfs_context(inode); cifs_set_ops(inode); } @@ -1638,7 +1639,7 @@ retry_iget5_locked: cifs_fattr_to_inode(inode, fattr, false); if (sb->s_flags & SB_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; - if (inode->i_state & I_NEW) { + if (inode_state_read_once(inode) & I_NEW) { inode->i_ino = hash; cifs_fscache_get_inode_cookie(inode); unlock_new_inode(inode); diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index ca8f3dd7ff63..78650527d4bb 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -7,6 +7,7 @@ #include <linux/pagemap.h> #include <linux/vfs.h> +#include <linux/fs_struct.h> #include <uapi/linux/magic.h> #include "cifsglob.h" #include "cifsproto.h" diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 09e3fc81d7cb..69cb81fa0d3a 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -1294,6 +1294,8 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, smb2_to_name = cifs_convert_path_to_utf16(to_name, cifs_sb); if (smb2_to_name == NULL) { rc = -ENOMEM; + if (cfile) + cifsFileInfo_put(cfile); goto smb2_rename_path; } in_iov.iov_base = smb2_to_name; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index b0739a2661bf..8b4a4573e9c3 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4054,9 +4054,12 @@ replay_again: smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base; - smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset), - le32_to_cpu(smb_rsp->OutputBufferLength), &rsp_iov, + rc = smb2_validate_iov(le16_to_cpu(smb_rsp->OutputBufferOffset), + le32_to_cpu(smb_rsp->OutputBufferLength), + &rsp_iov, sizeof(struct file_notify_information)); + if (rc) + goto cnotify_exit; *out_data = kmemdup((char *)smb_rsp + le16_to_cpu(smb_rsp->OutputBufferOffset), le32_to_cpu(smb_rsp->OutputBufferLength), GFP_KERNEL); diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 85a4c55b61b8..c6c428c2e08d 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -290,6 +290,9 @@ static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) break; case SMBDIRECT_SOCKET_CREATED: + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + case SMBDIRECT_SOCKET_CONNECTED: sc->status = SMBDIRECT_SOCKET_ERROR; break; diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 051cd9dbba13..915cedde5d66 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -830,7 +830,7 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) if (!server || server->terminate) continue; - if (CIFS_CHAN_NEEDS_RECONNECT(ses, i)) + if (CIFS_CHAN_NEEDS_RECONNECT(ses, cur)) continue; /* diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index f901ae18e68a..94454e8826b0 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -6092,8 +6092,8 @@ static int smb2_create_link(struct ksmbd_work *work, } ksmbd_debug(SMB, "target name is %s\n", target_name); - rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS, - &path, 0); + rc = ksmbd_vfs_kern_path_start_removing(work, link_name, LOOKUP_NO_SYMLINKS, + &path, 0); if (rc) { if (rc != -ENOENT) goto out; @@ -6111,7 +6111,7 @@ static int smb2_create_link(struct ksmbd_work *work, ksmbd_debug(SMB, "link already exists\n"); goto out; } - ksmbd_vfs_kern_path_unlock(&path); + ksmbd_vfs_kern_path_end_removing(&path); } rc = ksmbd_vfs_link(work, target_name, link_name); if (rc) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 7d86553fcc7c..e2be9a496154 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -334,6 +334,9 @@ smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) break; case SMBDIRECT_SOCKET_CREATED: + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + break; + case SMBDIRECT_SOCKET_CONNECTED: sc->status = SMBDIRECT_SOCKET_ERROR; break; @@ -1883,6 +1886,7 @@ static int smb_direct_accept_client(struct smbdirect_socket *sc) static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) { struct smbdirect_recv_io *recvmsg; + bool recv_posted = false; int ret; WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); @@ -1899,6 +1903,7 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) pr_err("Can't post recv: %d\n", ret); goto out_err; } + recv_posted = true; ret = smb_direct_accept_client(sc); if (ret) { @@ -1908,7 +1913,14 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc) return 0; out_err: - put_recvmsg(sc, recvmsg); + /* + * If the recv was never posted, return it to the free list. + * If it was posted, leave it alone so disconnect teardown can + * drain the QP and complete it (flush) and the completion path + * will unmap it exactly once. + */ + if (!recv_posted) + put_recvmsg(sc, recvmsg); return ret; } @@ -2606,7 +2618,7 @@ void ksmbd_rdma_destroy(void) } } -bool ksmbd_rdma_capable_netdev(struct net_device *netdev) +static bool ksmbd_find_rdma_capable_netdev(struct net_device *netdev) { struct smb_direct_device *smb_dev; int i; @@ -2648,6 +2660,28 @@ out: return rdma_capable; } +bool ksmbd_rdma_capable_netdev(struct net_device *netdev) +{ + struct net_device *lower_dev; + struct list_head *iter; + + if (ksmbd_find_rdma_capable_netdev(netdev)) + return true; + + /* check if netdev is bridge or VLAN */ + if (netif_is_bridge_master(netdev) || + netdev->priv_flags & IFF_802_1Q_VLAN) + netdev_for_each_lower_dev(netdev, lower_dev, iter) + if (ksmbd_find_rdma_capable_netdev(lower_dev)) + return true; + + /* check if netdev is IPoIB safely without layer violation */ + if (netdev->type == ARPHRD_INFINIBAND) + return true; + + return false; +} + static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .prepare = smb_direct_prepare, .disconnect = smb_direct_disconnect, diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 7a1e3dcc2cde..d2e391c29464 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -290,8 +290,11 @@ static int ksmbd_kthread_fn(void *p) } } up_read(&conn_list_lock); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + /* Per-IP limit hit: release the just-accepted socket. */ + sock_release(client_sk); continue; + } skip_max_ip_conns_limit: if (server_conf.max_connections && diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 891ed2dc2b73..03fd7409be79 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -49,27 +49,9 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, i_uid_write(inode, i_uid_read(parent_inode)); } -/** - * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable - * @parent: parent dentry - * @child: child dentry - * - * Returns: %0 on success, %-ENOENT if the parent dentry is not stable - */ -int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) -{ - inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - if (child->d_parent != parent) { - inode_unlock(d_inode(parent)); - return -ENOENT; - } - - return 0; -} - static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf, char *pathname, unsigned int flags, - struct path *path, bool do_lock) + struct path *path, bool for_remove) { struct qstr last; struct filename *filename __free(putname) = NULL; @@ -99,22 +81,20 @@ static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf, return -ENOENT; } - if (do_lock) { + if (for_remove) { err = mnt_want_write(path->mnt); if (err) { path_put(path); return -ENOENT; } - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, path->dentry, 0); + d = start_removing_noperm(path->dentry, &last); if (!IS_ERR(d)) { dput(path->dentry); path->dentry = d; return 0; } - inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); return -ENOENT; @@ -188,8 +168,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } mode |= S_IFREG; - err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), - dentry, mode, true); + err = vfs_create(mnt_idmap(path.mnt), dentry, mode, NULL); if (!err) { ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); @@ -230,7 +209,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; d = dentry; - dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); + dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode, NULL); if (IS_ERR(dentry)) err = PTR_ERR(dentry); else if (d_is_negative(dentry)) @@ -609,7 +588,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) idmap = mnt_idmap(path->mnt); if (S_ISDIR(d_inode(path->dentry)->i_mode)) { - err = vfs_rmdir(idmap, d_inode(parent), path->dentry); + err = vfs_rmdir(idmap, d_inode(parent), path->dentry, NULL); if (err && err != -ENOTEMPTY) ksmbd_debug(VFS, "rmdir failed, err %d\n", err); } else { @@ -681,7 +660,6 @@ out1: int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, char *newname, int flags) { - struct dentry *old_parent, *new_dentry, *trap; struct dentry *old_child = old_path->dentry; struct path new_path; struct qstr new_last; @@ -691,7 +669,6 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, struct ksmbd_file *parent_fp; int new_type; int err, lookup_flags = LOOKUP_NO_SYMLINKS; - int target_lookup_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; if (ksmbd_override_fsids(work)) return -ENOMEM; @@ -702,14 +679,6 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, goto revert_fsids; } - /* - * explicitly handle file overwrite case, for compatibility with - * filesystems that may not support rename flags (e.g: fuse) - */ - if (flags & RENAME_NOREPLACE) - target_lookup_flags |= LOOKUP_EXCL; - flags &= ~(RENAME_NOREPLACE); - retry: err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH, &new_path, &new_last, &new_type, @@ -726,17 +695,14 @@ retry: if (err) goto out2; - trap = lock_rename_child(old_child, new_path.dentry); - if (IS_ERR(trap)) { - err = PTR_ERR(trap); + rd.mnt_idmap = mnt_idmap(old_path->mnt); + rd.old_parent = NULL; + rd.new_parent = new_path.dentry; + rd.flags = flags; + rd.delegated_inode = NULL, + err = start_renaming_dentry(&rd, lookup_flags, old_child, &new_last); + if (err) goto out_drop_write; - } - - old_parent = dget(old_child->d_parent); - if (d_unhashed(old_child)) { - err = -EINVAL; - goto out3; - } parent_fp = ksmbd_lookup_fd_inode(old_child->d_parent); if (parent_fp) { @@ -749,44 +715,17 @@ retry: ksmbd_fd_put(work, parent_fp); } - new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, - lookup_flags | target_lookup_flags); - if (IS_ERR(new_dentry)) { - err = PTR_ERR(new_dentry); - goto out3; - } - - if (d_is_symlink(new_dentry)) { + if (d_is_symlink(rd.new_dentry)) { err = -EACCES; - goto out4; - } - - if (old_child == trap) { - err = -EINVAL; - goto out4; - } - - if (new_dentry == trap) { - err = -ENOTEMPTY; - goto out4; + goto out3; } - rd.mnt_idmap = mnt_idmap(old_path->mnt), - rd.old_parent = old_parent, - rd.old_dentry = old_child, - rd.new_parent = new_path.dentry, - rd.new_dentry = new_dentry, - rd.flags = flags, - rd.delegated_inode = NULL, err = vfs_rename(&rd); if (err) ksmbd_debug(VFS, "vfs_rename failed err %d\n", err); -out4: - dput(new_dentry); out3: - dput(old_parent); - unlock_rename(old_parent, new_path.dentry); + end_renaming(&rd); out_drop_write: mnt_drop_write(old_path->mnt); out2: @@ -1084,18 +1023,17 @@ int ksmbd_vfs_unlink(struct file *filp) return err; dir = dget_parent(dentry); - err = ksmbd_vfs_lock_parent(dir, dentry); - if (err) + dentry = start_removing_dentry(dir, dentry); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) goto out; - dget(dentry); if (S_ISDIR(d_inode(dentry)->i_mode)) - err = vfs_rmdir(idmap, d_inode(dir), dentry); + err = vfs_rmdir(idmap, d_inode(dir), dentry, NULL); else err = vfs_unlink(idmap, d_inode(dir), dentry, NULL); - dput(dentry); - inode_unlock(d_inode(dir)); + end_removing(dentry); if (err) ksmbd_debug(VFS, "failed to delete, err %d\n", err); out: @@ -1207,7 +1145,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, static int __ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath, unsigned int flags, - struct path *path, bool caseless, bool do_lock) + struct path *path, bool caseless, bool for_remove) { struct ksmbd_share_config *share_conf = work->tcon->share_conf; struct path parent_path; @@ -1215,7 +1153,7 @@ int __ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath, int err; retry: - err = ksmbd_vfs_path_lookup(share_conf, filepath, flags, path, do_lock); + err = ksmbd_vfs_path_lookup(share_conf, filepath, flags, path, for_remove); if (!err || !caseless) return err; @@ -1286,7 +1224,7 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath, } /** - * ksmbd_vfs_kern_path_locked() - lookup a file and get path info + * ksmbd_vfs_kern_path_start_remove() - lookup a file and get path info prior to removal * @work: work * @filepath: file path that is relative to share * @flags: lookup flags @@ -1298,20 +1236,19 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath, * filesystem will have been gained. * Return: 0 on if file was found, otherwise error */ -int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *filepath, - unsigned int flags, - struct path *path, bool caseless) +int ksmbd_vfs_kern_path_start_removing(struct ksmbd_work *work, char *filepath, + unsigned int flags, + struct path *path, bool caseless) { return __ksmbd_vfs_kern_path(work, filepath, flags, path, caseless, true); } -void ksmbd_vfs_kern_path_unlock(const struct path *path) +void ksmbd_vfs_kern_path_end_removing(const struct path *path) { - /* While lock is still held, ->d_parent is safe */ - inode_unlock(d_inode(path->dentry->d_parent)); + end_removing(path->dentry); mnt_drop_write(path->mnt); - path_put(path); + mntput(path->mnt); } struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index df6421b4590b..16ca29ee16e5 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -120,10 +120,10 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, bool caseless); -int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, - unsigned int flags, - struct path *path, bool caseless); -void ksmbd_vfs_kern_path_unlock(const struct path *path); +int ksmbd_vfs_kern_path_start_removing(struct ksmbd_work *work, char *name, + unsigned int flags, + struct path *path, bool caseless); +void ksmbd_vfs_kern_path_end_removing(const struct path *path); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, diff --git a/fs/splice.c b/fs/splice.c index f5094b6d00a0..d338fe56b50b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1498,7 +1498,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, /* * For lack of a better implementation, implement vmsplice() to userspace - * as a simple copy of the pipes pages to the user iov. + * as a simple copy of the pipe's pages to the user iov. */ static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index cceae3b78698..82b687414e65 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -86,7 +86,7 @@ struct inode *squashfs_iget(struct super_block *sb, long long ino, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; err = squashfs_read_inode(inode, ino); diff --git a/fs/super.c b/fs/super.c index 5bab94fb7e03..7c66b96b59be 100644 --- a/fs/super.c +++ b/fs/super.c @@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; + s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; return s; fail: @@ -1183,11 +1184,14 @@ static inline bool get_active_super(struct super_block *sb) static const char *filesystems_freeze_ptr = "filesystems_freeze"; -static void filesystems_freeze_callback(struct super_block *sb, void *unused) +static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all_ptr) { if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; + if (freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE)) + return; + if (!get_active_super(sb)) return; @@ -1201,9 +1205,13 @@ static void filesystems_freeze_callback(struct super_block *sb, void *unused) deactivate_super(sb); } -void filesystems_freeze(void) +void filesystems_freeze(bool freeze_all) { - __iterate_supers(filesystems_freeze_callback, NULL, + void *freeze_all_ptr = NULL; + + if (freeze_all) + freeze_all_ptr = &freeze_all; + __iterate_supers(filesystems_freeze_callback, freeze_all_ptr, SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE); } diff --git a/fs/sync.c b/fs/sync.c index 2955cd4c77a3..431fc5f5be06 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -117,16 +117,17 @@ SYSCALL_DEFINE0(sync) static void do_sync_work(struct work_struct *work) { int nowait = 0; + int wait = 1; /* * Sync twice to reduce the possibility we skipped some inodes / pages * because they were temporarily locked */ - iterate_supers(sync_inodes_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); sync_bdevs(false); - iterate_supers(sync_inodes_one_sb, &nowait); - iterate_supers(sync_fs_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, NULL); + iterate_supers(sync_fs_one_sb, &wait); sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); @@ -182,7 +183,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) if (!file->f_op->fsync) return -EINVAL; - if (!datasync && (inode->i_state & I_DIRTY_TIME)) + if (!datasync && (inode_state_read_once(inode) & I_DIRTY_TIME)) mark_inode_dirty_sync(inode); return file->f_op->fsync(file, start, end, datasync); } @@ -280,14 +281,12 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { - int sync_mode = WB_SYNC_NONE; - if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == SYNC_FILE_RANGE_WRITE_AND_WAIT) - sync_mode = WB_SYNC_ALL; - - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - sync_mode); + ret = filemap_fdatawrite_range(mapping, offset, + endbyte); + else + ret = filemap_flush_range(mapping, offset, endbyte); if (ret < 0) goto out; } diff --git a/fs/timerfd.c b/fs/timerfd.c index c68f28d9c426..9fcea7860ddf 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -393,9 +393,8 @@ static const struct file_operations timerfd_fops = { SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { - int ufd; - struct timerfd_ctx *ctx; - struct file *file; + struct timerfd_ctx *ctx __free(kfree) = NULL; + int ret; /* Check the TFD_* constants for consistency. */ BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); @@ -432,23 +431,13 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ctx->moffs = ktime_mono_to_real(0); - ufd = get_unused_fd_flags(flags & TFD_SHARED_FCNTL_FLAGS); - if (ufd < 0) { - kfree(ctx); - return ufd; - } - - file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx, - O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS), - FMODE_NOWAIT); - if (IS_ERR(file)) { - put_unused_fd(ufd); - kfree(ctx); - return PTR_ERR(file); - } - - fd_install(ufd, file); - return ufd; + ret = FD_ADD(flags & TFD_SHARED_FCNTL_FLAGS, + anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx, + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS), + FMODE_NOWAIT)); + if (ret >= 0) + retain_and_null_ptr(ctx); + return ret; } static int do_timerfd_settime(int ufd, int flags, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index ca41ce8208c4..c3265b8804f5 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1323,7 +1323,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) inode_lock(inode); /* Synchronize the inode unless this is a 'datasync()' call. */ - if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { + if (!datasync || (inode_state_read_once(inode) & I_DIRTY_DATASYNC)) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 46952a33c4e6..f453c37cee37 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -114,7 +114,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) inode = iget_locked(sb, inum); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ui = ubifs_inode(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a79d73f28aa7..7fae8002344a 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1962,7 +1962,7 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { if (UDF_I(inode)->i_hidden != hidden_inode) { iput(inode); return ERR_PTR(-EFSCORRUPTED); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 8361c00e8fa6..e2b0a35de2a7 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -655,7 +655,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; ufsi = UFS_I(inode); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 54c6cc7fe9c6..e6e74b384087 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2111,9 +2111,7 @@ static void init_once_userfaultfd_ctx(void *mem) static int new_userfaultfd(int flags) { - struct userfaultfd_ctx *ctx; - struct file *file; - int fd; + struct userfaultfd_ctx *ctx __free(kfree) = NULL; VM_WARN_ON_ONCE(!current->mm); @@ -2135,26 +2133,18 @@ static int new_userfaultfd(int flags) atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; - fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); - if (fd < 0) - goto err_out; + FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS, + anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), + NULL)); + if (fdf.err) + return fdf.err; - /* Create a new inode so that the LSM can block the creation. */ - file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); - if (IS_ERR(file)) { - put_unused_fd(fd); - fd = PTR_ERR(file); - goto err_out; - } /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - file->f_mode |= FMODE_NOWAIT; - fd_install(fd, file); - return fd; -err_out: - kmem_cache_free(userfaultfd_ctx_cachep, ctx); - return fd; + fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT; + retain_and_null_ptr(ctx); + return fd_publish(fdf); } static inline bool userfaultfd_syscall_allowed(int flags) diff --git a/fs/utimes.c b/fs/utimes.c index c7c7958e57b2..86f8ce8cd6b1 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -22,7 +22,7 @@ int vfs_utimes(const struct path *path, struct timespec64 *times) int error; struct iattr newattrs; struct inode *inode = path->dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; if (times) { if (!nsec_valid(times[0].tv_nsec) || @@ -66,7 +66,7 @@ retry_deleg: error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -76,6 +76,7 @@ retry_deleg: out: return error; } +EXPORT_SYMBOL_GPL(vfs_utimes); static int do_utimes_path(int dfd, const char __user *filename, struct timespec64 *times, int flags) diff --git a/fs/xattr.c b/fs/xattr.c index 8851a5ef34f5..32d445fb60aa 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -274,7 +274,7 @@ int __vfs_setxattr_noperm(struct mnt_idmap *idmap, int __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, - int flags, struct inode **delegated_inode) + int flags, struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; int error; @@ -305,7 +305,7 @@ vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; const void *orig_value = value; int error; @@ -322,7 +322,7 @@ retry_deleg: flags, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; @@ -533,7 +533,7 @@ EXPORT_SYMBOL(__vfs_removexattr); int __vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, - struct inode **delegated_inode) + struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; int error; @@ -567,7 +567,7 @@ vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; - struct inode *delegated_inode = NULL; + struct delegated_inode delegated_inode = { }; int error; retry_deleg: @@ -576,7 +576,7 @@ retry_deleg: name, &delegated_inode); inode_unlock(inode); - if (delegated_inode) { + if (is_delegated(&delegated_inode)) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index de840abc0bcd..57e47077c75a 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -73,7 +73,8 @@ #define XFS_ERRTAG_WRITE_DELAY_MS 43 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 #define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 -#define XFS_ERRTAG_MAX 46 +#define XFS_ERRTAG_FORCE_ZERO_RANGE 46 +#define XFS_ERRTAG_MAX 47 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -133,7 +134,8 @@ XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \ XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ -XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) +XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \ +XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4) #endif /* XFS_ERRTAG */ #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 2ef7742be7d3..7bfa37c99480 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1249,7 +1249,7 @@ xchk_irele( * hits do not clear DONTCACHE, so we must do it here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index a90a011c7e5f..4f7040c9ddf0 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1933,7 +1933,7 @@ xrep_inode_pptr( * Unlinked inodes that cannot be added to the directory tree will not * have a parent pointer. */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) return 0; /* Children of the superblock do not have parent pointers. */ diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 9c12cb844231..4e550a1d5353 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -152,11 +152,10 @@ xrep_orphanage_create( } /* Try to find the orphanage directory. */ - inode_lock_nested(root_inode, I_MUTEX_PARENT); - orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry); + orphanage_dentry = start_creating_noperm(root_dentry, &QSTR(ORPHANAGE)); if (IS_ERR(orphanage_dentry)) { error = PTR_ERR(orphanage_dentry); - goto out_unlock_root; + goto out_dput_root; } /* @@ -167,10 +166,10 @@ xrep_orphanage_create( */ if (d_really_is_negative(orphanage_dentry)) { orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, - orphanage_dentry, 0750); + orphanage_dentry, 0750, NULL); error = PTR_ERR(orphanage_dentry); if (IS_ERR(orphanage_dentry)) - goto out_unlock_root; + goto out_dput_orphanage; } /* Not a directory? Bail out. */ @@ -200,9 +199,7 @@ xrep_orphanage_create( sc->orphanage_ilock_flags = 0; out_dput_orphanage: - dput(orphanage_dentry); -out_unlock_root: - inode_unlock(VFS_I(sc->mp->m_rootip)); + end_creating(orphanage_dentry); out_dput_root: dput(root_dentry); out: diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 3b692c4acc1e..11d5de10fd56 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -915,7 +915,7 @@ xchk_pptr_looks_zapped( * Temporary files that cannot be linked into the directory tree do not * have attr forks because they cannot ever have parents. */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) return false; /* diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 5902398185a8..df629892462f 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -184,7 +184,7 @@ xrep_symlink_salvage_inline( sc->ip->i_disk_size == 1 && old_target[0] == '?') return 0; - nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); + nr = min(XFS_SYMLINK_MAXLEN, ifp->if_bytes); memcpy(target_buf, ifp->if_data, nr); return nr; } diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index cdd13ed9c569..ed2e8c64b1a8 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -834,7 +834,7 @@ xfarray_sort_scan( si->first_folio_idx = xfarray_idx(si->array, folio_pos(si->folio) + si->array->obj_size - 1); - next_pos = folio_pos(si->folio) + folio_size(si->folio); + next_pos = folio_next_pos(si->folio); si->last_folio_idx = xfarray_idx(si->array, next_pos - 1); if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos) si->last_folio_idx--; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a26f79815533..56a544638491 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -271,7 +271,7 @@ xfs_discard_folio( * folio itself and not the start offset that is passed in. */ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, - folio_pos(folio) + folio_size(folio), NULL); + folio_next_pos(folio), NULL); } /* @@ -742,14 +742,15 @@ xfs_vm_read_folio( struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &xfs_read_iomap_ops); + iomap_bio_read_folio(folio, &xfs_read_iomap_ops); + return 0; } STATIC void xfs_vm_readahead( struct readahead_control *rac) { - iomap_readahead(rac, &xfs_read_iomap_ops); + iomap_bio_readahead(rac, &xfs_read_iomap_ops); } static int diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 06ca11731e43..2208a720ec3f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -514,7 +514,7 @@ xfs_can_free_eofblocks( * Caller must either hold the exclusive io lock; or be inactivating * the inode, which guarantees there are no other users of the inode. */ - if (!(VFS_I(ip)->i_state & I_FREEING)) + if (!(inode_state_read_once(VFS_I(ip)) & I_FREEING)) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); /* prealloc/delalloc exists only on regular files */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index ee49f20875af..6917de832191 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -726,8 +726,10 @@ xfs_trim_rtgroup_extents( break; } - if (!tr.queued) + if (!tr.queued) { + kfree(tr.extents); break; + } /* * We hand the extent list to the discard function here so the diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2702fef2c90c..6108612182e2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -27,6 +27,8 @@ #include "xfs_file.h" #include "xfs_aops.h" #include "xfs_zone_alloc.h" +#include "xfs_error.h" +#include "xfs_errortag.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -674,8 +676,17 @@ xfs_file_dio_write_aligned( struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; + unsigned int dio_flags = 0; ssize_t ret; + /* + * For always COW inodes, each bio must be aligned to the file system + * block size and not just the device sector size because we need to + * allocate a block-aligned amount of space for each write. + */ + if (xfs_is_always_cow_inode(ip)) + dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -693,7 +704,7 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); out_unlock: xfs_iunlock(ip, iolock); return ret; @@ -890,15 +901,7 @@ xfs_file_dio_write( if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - /* - * For always COW inodes we also must check the alignment of each - * individual iovec segment, as they could end up with different - * I/Os due to the way bio_iov_iter_get_pages works, and we'd - * then overwrite an already written block. - */ - if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || - (xfs_is_always_cow_inode(ip) && - (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) + if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) return xfs_file_dio_write_unaligned(ip, iocb, from); if (xfs_is_zoned_inode(ip)) return xfs_file_dio_write_zoned(ip, iocb, from); @@ -1254,23 +1257,36 @@ xfs_falloc_zero_range( struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); unsigned int blksize = i_blocksize(inode); loff_t new_size = 0; int error; - trace_xfs_zero_file_space(XFS_I(inode)); + trace_xfs_zero_file_space(ip); error = xfs_falloc_newsize(file, mode, offset, len, &new_size); if (error) return error; - error = xfs_free_file_space(XFS_I(inode), offset, len, ac); - if (error) - return error; + /* + * Zero range implements a full zeroing mechanism but is only used in + * limited situations. It is more efficient to allocate unwritten + * extents than to perform zeroing here, so use an errortag to randomly + * force zeroing on DEBUG kernels for added test coverage. + */ + if (XFS_TEST_ERROR(ip->i_mount, + XFS_ERRTAG_FORCE_ZERO_RANGE)) { + error = xfs_zero_range(ip, offset, len, ac, NULL); + } else { + error = xfs_free_file_space(ip, offset, len, ac); + if (error) + return error; - len = round_up(offset + len, blksize) - round_down(offset, blksize); - offset = round_down(offset, blksize); - error = xfs_alloc_file_space(XFS_I(inode), offset, len); + len = round_up(offset + len, blksize) - + round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(ip, offset, len); + } if (error) return error; return xfs_falloc_setsize(file, new_size); diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index f19fce557354..5a3e3bf4e7cc 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -233,14 +233,11 @@ xfs_open_by_handle( xfs_fsop_handlereq_t *hreq) { const struct cred *cred = current_cred(); - int error; - int fd; int permflag; - struct file *filp; struct inode *inode; struct dentry *dentry; fmode_t fmode; - struct path path; + struct path path __free(path_put) = {}; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -249,12 +246,11 @@ xfs_open_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); inode = d_inode(dentry); + path.dentry = dentry; /* Restrict xfs_open_by_handle to directories & regular files. */ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - error = -EPERM; - goto out_dput; - } + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EPERM; #if BITS_PER_LONG != 32 hreq->oflags |= O_LARGEFILE; @@ -263,48 +259,30 @@ xfs_open_by_handle( permflag = hreq->oflags; fmode = OPEN_FMODE(permflag); if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && - (fmode & FMODE_WRITE) && IS_APPEND(inode)) { - error = -EPERM; - goto out_dput; - } + (fmode & FMODE_WRITE) && IS_APPEND(inode)) + return -EPERM; - if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - error = -EPERM; - goto out_dput; - } + if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) + return -EPERM; /* Can't write directories. */ - if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { - error = -EISDIR; - goto out_dput; - } + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) + return -EISDIR; - fd = get_unused_fd_flags(0); - if (fd < 0) { - error = fd; - goto out_dput; - } + path.mnt = mntget(parfilp->f_path.mnt); - path.mnt = parfilp->f_path.mnt; - path.dentry = dentry; - filp = dentry_open(&path, hreq->oflags, cred); - dput(dentry); - if (IS_ERR(filp)) { - put_unused_fd(fd); - return PTR_ERR(filp); - } + FD_PREPARE(fdf, 0, dentry_open(&path, hreq->oflags, cred)); + if (fdf.err) + return fdf.err; if (S_ISREG(inode->i_mode)) { + struct file *filp = fd_prepare_file(fdf); + filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NOCMTIME; } - fd_install(fd, filp); - return fd; - - out_dput: - dput(dentry); - return error; + return fd_publish(fdf); } int diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 7c541fb373d5..3c1557fb1cf0 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -285,7 +285,7 @@ xfs_inode_mark_sick( * is not the case here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } @@ -309,7 +309,7 @@ xfs_inode_mark_corrupt( * is not the case here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e44040206851..f3fc4d21bfe1 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -334,7 +334,7 @@ xfs_reinit_inode( dev_t dev = inode->i_rdev; kuid_t uid = inode->i_uid; kgid_t gid = inode->i_gid; - unsigned long state = inode->i_state; + unsigned long state = inode_state_read_once(inode); error = inode_init_always(mp->m_super, inode); @@ -345,7 +345,7 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; - inode->i_state = state; + inode_state_assign_raw(inode, state); mapping_set_folio_min_order(inode->i_mapping, M_IGEO(mp)->min_folio_order); return error; @@ -411,7 +411,7 @@ xfs_iget_recycle( ip->i_flags |= XFS_INEW; xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - inode->i_state = I_NEW; + inode_state_assign_raw(inode, I_NEW); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 36b39539e561..f1f88e48fe22 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1580,7 +1580,7 @@ xfs_iunlink_reload_next( next_ip->i_prev_unlinked = prev_agino; trace_xfs_iunlink_reload_next(next_ip); rele: - ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); + ASSERT(!(inode_state_read_once(VFS_I(next_ip)) & I_DONTCACHE)); if (xfs_is_quotacheck_running(mp) && next_ip) xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); xfs_irele(next_ip); @@ -2111,7 +2111,7 @@ xfs_rename_alloc_whiteout( */ xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); - VFS_I(tmpfile)->i_state |= I_LINKABLE; + inode_state_set_raw(VFS_I(tmpfile), I_LINKABLE); *wip = tmpfile; return 0; @@ -2330,7 +2330,7 @@ retry: * flag from the inode so it doesn't accidentally get misused in * future. */ - VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE; + inode_state_clear_raw(VFS_I(du_wip.ip), I_LINKABLE); } out_commit: diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 1bd411a1114c..2eb0c6011a2e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -113,9 +113,9 @@ xfs_inode_item_precommit( * to log the timestamps, or will clear already cleared fields in the * worst case. */ - if (inode->i_state & I_DIRTY_TIME) { + if (inode_state_read_once(inode) & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a6bb7ee7a27a..59eaad774371 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1408,10 +1408,8 @@ xfs_file_ioctl( trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_); - sb_start_write(mp->m_super); - error = xfs_blockgc_free_space(mp, &icw); - sb_end_write(mp->m_super); - return error; + guard(super_write)(mp->m_super); + return xfs_blockgc_free_space(mp, &icw); } case XFS_IOC_EXCHANGE_RANGE: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d3f6e3e42a11..04f39ea15898 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1091,6 +1091,29 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { }; #endif /* CONFIG_XFS_RT */ +#ifdef DEBUG +static void +xfs_check_atomic_cow_conversion( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + const struct xfs_bmbt_irec *cmap) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec cmap2 = { }; + + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap2)) + xfs_trim_extent(&cmap2, offset_fsb, count_fsb); + + ASSERT(cmap2.br_startoff == cmap->br_startoff); + ASSERT(cmap2.br_blockcount == cmap->br_blockcount); + ASSERT(cmap2.br_startblock == cmap->br_startblock); + ASSERT(cmap2.br_state == cmap->br_state); +} +#else +# define xfs_check_atomic_cow_conversion(...) ((void)0) +#endif + static int xfs_atomic_write_cow_iomap_begin( struct inode *inode, @@ -1102,9 +1125,10 @@ xfs_atomic_write_cow_iomap_begin( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); - xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); - xfs_filblks_t count_fsb = end_fsb - offset_fsb; + const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + const xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + const xfs_filblks_t count_fsb = end_fsb - offset_fsb; + xfs_filblks_t hole_count_fsb; int nmaps = 1; xfs_filblks_t resaligned; struct xfs_bmbt_irec cmap; @@ -1130,7 +1154,7 @@ xfs_atomic_write_cow_iomap_begin( return -EAGAIN; trace_xfs_iomap_atomic_write_cow(ip, offset, length); - +retry: xfs_ilock(ip, XFS_ILOCK_EXCL); if (!ip->i_cowfp) { @@ -1141,14 +1165,22 @@ xfs_atomic_write_cow_iomap_begin( if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) cmap.br_startoff = end_fsb; if (cmap.br_startoff <= offset_fsb) { + if (isnullstartblock(cmap.br_startblock)) + goto convert_delay; + + /* + * cmap could extend outside the write range due to previous + * speculative preallocations. We must trim cmap to the write + * range because the cow fork treats written mappings to mean + * "write in progress". + */ xfs_trim_extent(&cmap, offset_fsb, count_fsb); goto found; } - end_fsb = cmap.br_startoff; - count_fsb = end_fsb - offset_fsb; + hole_count_fsb = cmap.br_startoff - offset_fsb; - resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, + resaligned = xfs_aligned_fsb_count(offset_fsb, hole_count_fsb, xfs_get_cowextsz_hint(ip)); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1169,8 +1201,10 @@ xfs_atomic_write_cow_iomap_begin( if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) cmap.br_startoff = end_fsb; if (cmap.br_startoff <= offset_fsb) { - xfs_trim_extent(&cmap, offset_fsb, count_fsb); xfs_trans_cancel(tp); + if (isnullstartblock(cmap.br_startblock)) + goto convert_delay; + xfs_trim_extent(&cmap, offset_fsb, count_fsb); goto found; } @@ -1182,7 +1216,7 @@ xfs_atomic_write_cow_iomap_begin( * atomic writes to that same range will be aligned (and don't require * this COW-based method). */ - error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + error = xfs_bmapi_write(tp, ip, offset_fsb, hole_count_fsb, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps); if (error) { @@ -1195,21 +1229,43 @@ xfs_atomic_write_cow_iomap_begin( if (error) goto out_unlock; + /* + * cmap could map more blocks than the range we passed into bmapi_write + * because of EXTSZALIGN or adjacent pre-existing unwritten mappings + * that were merged. Trim cmap to the original write range so that we + * don't convert more than we were asked to do for this write. + */ + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + found: if (cmap.br_state != XFS_EXT_NORM) { - error = xfs_reflink_convert_cow_locked(ip, offset_fsb, - count_fsb); + error = xfs_reflink_convert_cow_locked(ip, cmap.br_startoff, + cmap.br_blockcount); if (error) goto out_unlock; cmap.br_state = XFS_EXT_NORM; + xfs_check_atomic_cow_conversion(ip, offset_fsb, count_fsb, + &cmap); } - length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); - trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); + trace_xfs_iomap_found(ip, offset, length, XFS_COW_FORK, &cmap); seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); +convert_delay: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_bmapi_convert_delalloc(ip, XFS_COW_FORK, offset, iomap, + NULL); + if (error) + return error; + + /* + * Try the lookup again, because the delalloc conversion might have + * turned the COW mapping into unwritten, but we need it to be in + * written state. + */ + goto retry; out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -1702,6 +1758,8 @@ xfs_buffered_write_iomap_begin( struct iomap *iomap, struct iomap *srcmap) { + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, + iomap); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -1767,21 +1825,41 @@ xfs_buffered_write_iomap_begin( } /* - * For zeroing, trim a delalloc extent that extends beyond the EOF - * block. If it starts beyond the EOF block, convert it to an + * For zeroing, trim extents that extend beyond the EOF block. If a + * delalloc extent starts beyond the EOF block, convert it to an * unwritten extent. */ - if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && - isnullstartblock(imap.br_startblock)) { + if (flags & IOMAP_ZERO) { xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + u64 end; - if (offset_fsb >= eof_fsb) + if (isnullstartblock(imap.br_startblock) && + offset_fsb >= eof_fsb) goto convert_delay; - if (end_fsb > eof_fsb) { + if (offset_fsb < eof_fsb && end_fsb > eof_fsb) end_fsb = eof_fsb; - xfs_trim_extent(&imap, offset_fsb, - end_fsb - offset_fsb); + + /* + * Look up dirty folios for unwritten mappings within EOF. + * Providing this bypasses the flush iomap uses to trigger + * extent conversion when unwritten mappings have dirty + * pagecache in need of zeroing. + * + * Trim the mapping to the end pos of the lookup, which in turn + * was trimmed to the end of the batch if it became full before + * the end of the mapping. + */ + if (imap.br_state == XFS_EXT_UNWRITTEN && + offset_fsb < eof_fsb) { + loff_t len = min(count, + XFS_FSB_TO_B(mp, imap.br_blockcount)); + + end = iomap_fill_dirty_folios(iter, offset, len); + end_fsb = min_t(xfs_fileoff_t, end_fsb, + XFS_B_TO_FSB(mp, end)); } + + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); } /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index caff0125faea..ad94fbf55014 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1420,7 +1420,7 @@ xfs_setup_inode( bool is_meta = xfs_is_internal_inode(ip); inode->i_ino = ip->i_ino; - inode->i_state |= I_NEW; + inode_state_set_raw(inode, I_NEW); inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 36cda724da89..9d1ed9bb0bee 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -17,7 +17,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip) { struct inode *inode = VFS_I(ip); - if ((inode->i_state & I_DIRTY_PAGES) || + if ((inode_state_read_once(inode) & I_DIRTY_PAGES) || mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&inode->i_dio_count)) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1067ebb3b001..bc71aa9dcee8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1693,7 +1693,10 @@ xfs_fs_fill_super( if (error) return error; - sb_min_blocksize(sb, BBSIZE); + if (!sb_min_blocksize(sb, BBSIZE)) { + xfs_err(mp, "unable to set blocksize"); + return -EINVAL; + } sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; #ifdef CONFIG_XFS_QUOTA diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 040402240807..8dde444596f1 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -615,7 +615,7 @@ xfs_select_open_zone_mru( lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, file_hint, oz, false)) + if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); @@ -1204,6 +1204,7 @@ xfs_mount_zones( .mp = mp, }; struct xfs_buftarg *bt = mp->m_rtdev_targp; + xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks; int error; if (!bt) { @@ -1234,10 +1235,33 @@ xfs_mount_zones( return -ENOMEM; xfs_info(mp, "%u zones of %u blocks (%u max open zones)", - mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, - mp->m_max_open_zones); + mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); + /* + * The writeback code switches between inodes regularly to provide + * fairness. The default lower bound is 4MiB, but for zoned file + * systems we want to increase that both to reduce seeks, but also more + * importantly so that workloads that writes files in a multiple of the + * zone size do not get fragmented and require garbage collection when + * they shouldn't. Increase is to the zone size capped by the max + * extent len. + * + * Note that because s_min_writeback_pages is a superblock field, this + * value also get applied to non-zoned files on the data device if + * there are any. On typical zoned setup all data is on the RT device + * because using the more efficient sequential write required zones + * is the reason for using the zone allocator, and either the RT device + * and the (meta)data device are on the same block device, or the + * (meta)data device is on a fast SSD while the data on the RT device + * is on a SMR HDD. In any combination of the above cases enforcing + * the higher min_writeback_pages for non-RT inodes is either a noop + * or beneficial. + */ + mp->m_super->s_min_writeback_pages = + XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >> + PAGE_SHIFT; + if (bdev_is_zoned(bt->bt_bdev)) { error = blkdev_report_zones(bt->bt_bdev, XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), @@ -1249,8 +1273,10 @@ xfs_mount_zones( while ((rtg = xfs_rtgroup_next(mp, rtg))) { error = xfs_init_zone(&iz, rtg, NULL); - if (error) + if (error) { + xfs_rtgroup_rele(rtg); goto out_free_zone_info; + } } } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 90e2ad8ee5f4..c1e5e30e90a0 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -112,12 +112,13 @@ static const struct iomap_ops zonefs_write_iomap_ops = { static int zonefs_read_folio(struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &zonefs_read_iomap_ops); + iomap_bio_read_folio(folio, &zonefs_read_iomap_ops); + return 0; } static void zonefs_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &zonefs_read_iomap_ops); + iomap_bio_readahead(rac, &zonefs_read_iomap_ops); } /* diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 70be0b3dda49..086a31269198 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -644,7 +644,7 @@ static struct inode *zonefs_get_file_inode(struct inode *dir, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) { + if (!(inode_state_read_once(inode) & I_NEW)) { WARN_ON_ONCE(inode->i_private != z); return inode; } @@ -683,7 +683,7 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode_state_read_once(inode) & I_NEW)) return inode; inode->i_ino = ino; diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 7146a8e9e9c2..d0eccbd920e5 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -417,15 +417,32 @@ static inline void acpi_processor_throttling_init(void) {} #endif /* CONFIG_ACPI_CPU_FREQ_PSS */ /* in processor_idle.c */ +extern struct cpuidle_driver acpi_idle_driver; #ifdef CONFIG_ACPI_PROCESSOR_IDLE -void acpi_processor_power_init(struct acpi_processor *pr); -void acpi_processor_power_exit(struct acpi_processor *pr); +int acpi_processor_power_init(struct acpi_processor *pr); +int acpi_processor_power_exit(struct acpi_processor *pr); int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); -void acpi_processor_register_idle_driver(void); -void acpi_processor_unregister_idle_driver(void); -int acpi_processor_ffh_lpi_probe(unsigned int cpu); -int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi); +#else +static inline int acpi_processor_power_init(struct acpi_processor *pr) +{ + return -ENODEV; +} + +static inline int acpi_processor_power_exit(struct acpi_processor *pr) +{ + return -ENODEV; +} + +static inline int acpi_processor_power_state_has_changed(struct acpi_processor *pr) +{ + return -ENODEV; +} + +static inline int acpi_processor_hotplug(struct acpi_processor *pr) +{ + return -ENODEV; +} #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ /* in processor_thermal.c */ @@ -448,6 +465,11 @@ static inline void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy) } #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_ACPI_PROCESSOR_IDLE +extern int acpi_processor_ffh_lpi_probe(unsigned int cpu); +extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi); +#endif + void acpi_processor_init_invariance_cppc(void); #endif diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 387720933973..09e8eccee8ed 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -13,10 +13,19 @@ #define BUGFLAG_ONCE (1 << 1) #define BUGFLAG_DONE (1 << 2) #define BUGFLAG_NO_CUT_HERE (1 << 3) /* CUT_HERE already sent */ +#define BUGFLAG_ARGS (1 << 4) #define BUGFLAG_TAINT(taint) ((taint) << 8) #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif +#ifndef WARN_CONDITION_STR +#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED +# define WARN_CONDITION_STR(cond_str) "[" cond_str "] " +#else +# define WARN_CONDITION_STR(cond_str) +#endif +#endif /* WARN_CONDITION_STR */ + #ifndef __ASSEMBLY__ #include <linux/panic.h> #include <linux/printk.h> @@ -29,19 +38,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #ifdef CONFIG_BUG -#ifdef CONFIG_GENERIC_BUG -struct bug_entry { #ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS - unsigned long bug_addr; +#define BUG_REL(type, name) type name #else - signed int bug_addr_disp; +#define BUG_REL(type, name) signed int name##_disp #endif -#ifdef CONFIG_DEBUG_BUGVERBOSE -#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS - const char *file; -#else - signed int file_disp; + +#ifdef CONFIG_GENERIC_BUG +struct bug_entry { + BUG_REL(unsigned long, bug_addr); +#ifdef HAVE_ARCH_BUG_FORMAT + BUG_REL(const char *, format); #endif +#ifdef CONFIG_DEBUG_BUGVERBOSE + BUG_REL(const char *, file); unsigned short line; #endif unsigned short flags; @@ -92,28 +102,50 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); extern __printf(1, 2) void __warn_printk(const char *fmt, ...); -#ifndef __WARN_FLAGS -#define __WARN() __WARN_printf(TAINT_WARN, NULL) +#ifdef __WARN_FLAGS +#define __WARN() __WARN_FLAGS("", BUGFLAG_TAINT(TAINT_WARN)) + +#ifndef WARN_ON +#define WARN_ON(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_FLAGS(#condition, \ + BUGFLAG_TAINT(TAINT_WARN)); \ + unlikely(__ret_warn_on); \ +}) +#endif + +#ifndef WARN_ON_ONCE +#define WARN_ON_ONCE(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_FLAGS(#condition, \ + BUGFLAG_ONCE | \ + BUGFLAG_TAINT(TAINT_WARN)); \ + unlikely(__ret_warn_on); \ +}) +#endif +#endif /* __WARN_FLAGS */ + +#if defined(__WARN_FLAGS) && !defined(__WARN_printf) #define __WARN_printf(taint, arg...) do { \ instrumentation_begin(); \ - warn_slowpath_fmt(__FILE__, __LINE__, taint, arg); \ + __warn_printk(arg); \ + __WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ instrumentation_end(); \ } while (0) -#else -#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)) +#endif + +#ifndef __WARN_printf #define __WARN_printf(taint, arg...) do { \ instrumentation_begin(); \ - __warn_printk(arg); \ - __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ + warn_slowpath_fmt(__FILE__, __LINE__, taint, arg); \ instrumentation_end(); \ } while (0) -#define WARN_ON_ONCE(condition) ({ \ - int __ret_warn_on = !!(condition); \ - if (unlikely(__ret_warn_on)) \ - __WARN_FLAGS(BUGFLAG_ONCE | \ - BUGFLAG_TAINT(TAINT_WARN)); \ - unlikely(__ret_warn_on); \ -}) +#endif + +#ifndef __WARN +#define __WARN() __WARN_printf(TAINT_WARN, NULL) #endif /* used internally by panic.c */ @@ -148,8 +180,10 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); DO_ONCE_LITE_IF(condition, WARN_ON, 1) #endif +#ifndef WARN_ONCE #define WARN_ONCE(condition, format...) \ DO_ONCE_LITE_IF(condition, WARN, 1, format) +#endif #define WARN_TAINT_ONCE(condition, taint, format...) \ DO_ONCE_LITE_IF(condition, WARN_TAINT, 1, taint, format) diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h index ee3793e9b1a4..da1610a78f92 100644 --- a/include/asm-generic/thread_info_tif.h +++ b/include/asm-generic/thread_info_tif.h @@ -45,4 +45,7 @@ # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #endif +#define TIF_RSEQ 11 // Run RSEQ fast path +#define _TIF_RSEQ BIT(TIF_RSEQ) + #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index e04d56a5332e..a464ff6c1a61 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -87,39 +87,56 @@ #define ALIGN_FUNCTION() . = ALIGN(CONFIG_FUNCTION_ALIGNMENT) /* - * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which - * generates .data.identifier sections, which need to be pulled in with - * .data. We don't want to pull in .data..other sections, which Linux - * has defined. Same for text and bss. + * Support -ffunction-sections by matching .text and .text.*, + * but exclude '.text..*', .text.startup[.*], and .text.exit[.*]. * - * With LTO_CLANG, the linker also splits sections by default, so we need - * these macros to combine the sections during the final link. + * .text.startup and .text.startup.* are matched later by INIT_TEXT, and + * .text.exit and .text.exit.* are matched later by EXIT_TEXT, so they must be + * explicitly excluded here. * - * With AUTOFDO_CLANG and PROPELLER_CLANG, by default, the linker splits - * text sections and regroups functions into subsections. + * Other .text.* sections that are typically grouped separately, such as + * .text.unlikely or .text.hot, must be matched explicitly before using + * TEXT_MAIN. * - * RODATA_MAIN is not used because existing code already defines .rodata.x - * sections to be brought in with rodata. + * NOTE: builds *with* and *without* -ffunction-sections are both supported by + * this single macro. Even with -ffunction-sections, there may be some objects + * NOT compiled with the flag due to the use of a specific Makefile override + * like cflags-y or AUTOFDO_PROFILE_foo.o. So this single catchall rule is + * needed to support mixed object builds. + * + * One implication is that functions named startup(), exit(), split(), + * unlikely(), hot(), and unknown() are not allowed in the kernel due to the + * ambiguity of their section names with -ffunction-sections. For example, + * .text.startup could be __attribute__((constructor)) code in a *non* + * ffunction-sections object, which should be placed in .init.text; or it could + * be an actual function named startup() in an ffunction-sections object, which + * should be placed in .text. The build will detect and complain about any such + * ambiguously named functions. + */ +#define TEXT_MAIN \ + .text \ + .text.[_0-9A-Za-df-rt-z]* \ + .text.s[_0-9A-Za-su-z]* .text.s .text.s.* \ + .text.st[_0-9A-Zb-z]* .text.st .text.st.* \ + .text.sta[_0-9A-Za-qs-z]* .text.sta .text.sta.* \ + .text.star[_0-9A-Za-su-z]* .text.star .text.star.* \ + .text.start[_0-9A-Za-tv-z]* .text.start .text.start.* \ + .text.startu[_0-9A-Za-oq-z]* .text.startu .text.startu.* \ + .text.startup[_0-9A-Za-z]* \ + .text.e[_0-9A-Za-wy-z]* .text.e .text.e.* \ + .text.ex[_0-9A-Za-hj-z]* .text.ex .text.ex.* \ + .text.exi[_0-9A-Za-su-z]* .text.exi .text.exi.* \ + .text.exit[_0-9A-Za-z]* + +/* + * Support -fdata-sections by matching .data, .data.*, and others, + * but exclude '.data..*'. */ -#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG) || \ -defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) -#define TEXT_MAIN .text .text.[0-9a-zA-Z_]* -#else -#define TEXT_MAIN .text -#endif -#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG) #define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data.rel.* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$L* #define SDATA_MAIN .sdata .sdata.[0-9a-zA-Z_]* #define RODATA_MAIN .rodata .rodata.[0-9a-zA-Z_]* .rodata..L* #define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* .bss..L* .bss..compoundliteral* #define SBSS_MAIN .sbss .sbss.[0-9a-zA-Z_]* -#else -#define DATA_MAIN .data .data.rel .data.rel.local -#define SDATA_MAIN .sdata -#define RODATA_MAIN .rodata -#define BSS_MAIN .bss -#define SBSS_MAIN .sbss -#endif /* * GCC 4.5 and later have a 32 bytes section alignment for structures. @@ -581,9 +598,8 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) * during second ld run in second ld pass when generating System.map * * TEXT_MAIN here will match symbols with a fixed pattern (for example, - * .text.hot or .text.unlikely) if dead code elimination or - * function-section is enabled. Match these symbols first before - * TEXT_MAIN to ensure they are grouped together. + * .text.hot or .text.unlikely). Match those before TEXT_MAIN to ensure + * they get grouped together. * * Also placing .text.hot section at the beginning of a page, this * would help the TLB performance. @@ -729,16 +745,16 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define INIT_TEXT \ *(.init.text .init.text.*) \ - *(.text.startup) + *(.text.startup .text.startup.*) #define EXIT_DATA \ *(.exit.data .exit.data.*) \ *(.fini_array .fini_array.*) \ - *(.dtors .dtors.*) \ + *(.dtors .dtors.*) #define EXIT_TEXT \ *(.exit.text) \ - *(.text.exit) \ + *(.text.exit .text.exit.*) #define EXIT_CALL \ *(.exitcall.exit) @@ -955,7 +971,8 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define RUNTIME_CONST_VARIABLES \ RUNTIME_CONST(shift, d_hash_shift) \ - RUNTIME_CONST(ptr, dentry_hashtable) + RUNTIME_CONST(ptr, dentry_hashtable) \ + RUNTIME_CONST(ptr, __dentry_cache) /* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */ #define KUNIT_TABLE() \ diff --git a/include/drm/Makefile b/include/drm/Makefile index 1df6962556ef..48fae3f167c7 100644 --- a/include/drm/Makefile +++ b/include/drm/Makefile @@ -11,7 +11,7 @@ always-$(CONFIG_DRM_HEADER_TEST) += \ quiet_cmd_hdrtest = HDRTEST $(patsubst %.hdrtest,%.h,$@) cmd_hdrtest = \ $(CC) $(c_flags) -fsyntax-only -x c /dev/null -include $< -include $<; \ - PYTHONDONTWRITEBYTECODE=1 $(KERNELDOC) -none $(if $(CONFIG_WERROR)$(CONFIG_DRM_WERROR),-Werror) $<; \ + PYTHONDONTWRITEBYTECODE=1 $(PYTHON3) $(KERNELDOC) -none $(if $(CONFIG_WERROR)$(CONFIG_DRM_WERROR),-Werror) $<; \ touch $@ $(obj)/%.hdrtest: $(src)/%.h FORCE diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h index da6301a6fcea..69d4ae92d822 100644 --- a/include/drm/intel/pciids.h +++ b/include/drm/intel/pciids.h @@ -877,7 +877,10 @@ MACRO__(0xB08F, ## __VA_ARGS__), \ MACRO__(0xB090, ## __VA_ARGS__), \ MACRO__(0xB0A0, ## __VA_ARGS__), \ - MACRO__(0xB0B0, ## __VA_ARGS__), \ + MACRO__(0xB0B0, ## __VA_ARGS__) + +/* WCL */ +#define INTEL_WCL_IDS(MACRO__, ...) \ MACRO__(0xFD80, ## __VA_ARGS__), \ MACRO__(0xFD81, ## __VA_ARGS__) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead..607db773b672 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1509,12 +1509,19 @@ static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console) #if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI) int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res); +const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, + unsigned int index); #else static inline int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) { return -EINVAL; } +static inline const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, + unsigned int index) +{ + return NULL; +} #endif #ifdef CONFIG_ACPI_LPIT diff --git a/include/linux/annotate.h b/include/linux/annotate.h new file mode 100644 index 000000000000..7c10d34d198c --- /dev/null +++ b/include/linux/annotate.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ANNOTATE_H +#define _LINUX_ANNOTATE_H + +#include <linux/objtool_types.h> + +#ifdef CONFIG_OBJTOOL + +#ifndef __ASSEMBLY__ + +#define __ASM_ANNOTATE(section, label, type) \ + ".pushsection " section ",\"M\", @progbits, 8\n\t" \ + ".long " __stringify(label) " - .\n\t" \ + ".long " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define ASM_ANNOTATE_LABEL(label, type) \ + __ASM_ANNOTATE(".discard.annotate_insn", label, type) + +#define ASM_ANNOTATE(type) \ + "911:\n\t" \ + ASM_ANNOTATE_LABEL(911b, type) + +#define ASM_ANNOTATE_DATA(type) \ + "912:\n\t" \ + __ASM_ANNOTATE(".discard.annotate_data", 912b, type) + +#else /* __ASSEMBLY__ */ + +.macro __ANNOTATE section, type +.Lhere_\@: + .pushsection \section, "M", @progbits, 8 + .long .Lhere_\@ - . + .long \type + .popsection +.endm + +.macro ANNOTATE type + __ANNOTATE ".discard.annotate_insn", \type +.endm + +.macro ANNOTATE_DATA type + __ANNOTATE ".discard.annotate_data", \type +.endm + +#endif /* __ASSEMBLY__ */ + +#else /* !CONFIG_OBJTOOL */ +#ifndef __ASSEMBLY__ +#define ASM_ANNOTATE_LABEL(label, type) "" +#define ASM_ANNOTATE(type) +#define ASM_ANNOTATE_DATA(type) +#else /* __ASSEMBLY__ */ +.macro ANNOTATE type +.endm +.macro ANNOTATE_DATA type +.endm +#endif /* __ASSEMBLY__ */ +#endif /* !CONFIG_OBJTOOL */ + +#ifndef __ASSEMBLY__ + +/* + * Annotate away the various 'relocation to !ENDBR` complaints; knowing that + * these relocations will never be used for indirect calls. + */ +#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +#define ANNOTATE_NOENDBR_SYM(sym) asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOENDBR)) + +/* + * This should be used immediately before an indirect jump/call. It tells + * objtool the subsequent indirect jump/call is vouched safe for retpoline + * builds. + */ +#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) +/* + * See linux/instrumentation.h + */ +#define ANNOTATE_INSTR_BEGIN(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_BEGIN) +#define ANNOTATE_INSTR_END(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_END) +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) +/* + * Use objtool to validate the entry requirement that all code paths do + * VALIDATE_UNRET_END before RET. + * + * NOTE: The macro must be used at the beginning of a global symbol, otherwise + * it will be ignored. + */ +#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) +/* + * This should be used to refer to an instruction that is considered + * terminating, like a noreturn CALL or UD2 when we know they are not -- eg + * WARN using UD2. + */ +#define ANNOTATE_REACHABLE(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_REACHABLE) +/* + * This should not be used; it annotates away CFI violations. There are a few + * valid use cases like kexec handover to the next kernel image, and there is + * no security concern there. + * + * There are also a few real issues annotated away, like EFI because we can't + * control the EFI code. + */ +#define ANNOTATE_NOCFI_SYM(sym) asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOCFI)) + +/* + * Annotate a special section entry. This emables livepatch module generation + * to find and extract individual special section entries as needed. + */ +#define ANNOTATE_DATA_SPECIAL ASM_ANNOTATE_DATA(ANNOTYPE_DATA_SPECIAL) + +#else /* __ASSEMBLY__ */ +#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR +#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE +/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ +/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ +#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS +#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL +#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE +#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI +#define ANNOTATE_DATA_SPECIAL ANNOTATE_DATA type=ANNOTYPE_DATA_SPECIAL +#endif /* __ASSEMBLY__ */ + +#endif /* _LINUX_ANNOTATE_H */ diff --git a/include/linux/ata.h b/include/linux/ata.h index 792e10a09787..c9013e472aa3 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -566,6 +566,7 @@ struct ata_bmdma_prd { #define ata_id_has_ncq(id) ((id)[ATA_ID_SATA_CAPABILITY] & (1 << 8)) #define ata_id_queue_depth(id) (((id)[ATA_ID_QUEUE_DEPTH] & 0x1f) + 1) #define ata_id_removable(id) ((id)[ATA_ID_CONFIG] & (1 << 7)) +#define ata_id_is_locked(id) (((id)[ATA_ID_DLF] & 0x7) == 0x7) #define ata_id_has_atapi_AN(id) \ ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index 9409a6ddf3e0..37ab6314a9f7 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -1276,7 +1276,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg(v, old, new); } @@ -1298,7 +1298,7 @@ static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_acquire(v, old, new); } @@ -1321,7 +1321,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_release(v, old, new); } @@ -1343,7 +1343,7 @@ static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_relaxed(v, old, new); } @@ -2854,7 +2854,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg(v, old, new); } @@ -2876,7 +2876,7 @@ static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_acquire(v, old, new); } @@ -2899,7 +2899,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_release(v, old, new); } @@ -2921,7 +2921,7 @@ static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_relaxed(v, old, new); } @@ -4432,7 +4432,7 @@ atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg(v, old, new); } @@ -4454,7 +4454,7 @@ static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_acquire(v, old, new); } @@ -4477,7 +4477,7 @@ atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_release(v, old, new); } @@ -4499,7 +4499,7 @@ static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_relaxed(v, old, new); } @@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 8829b337928e9508259079d32581775ececd415b +// f618ac667f868941a84ce0ab2242f1786e049ed4 diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c5c9d89c73ed..610ef62b6a32 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -63,6 +63,8 @@ enum wb_reason { struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; + unsigned long progress_stamp; /* The jiffies when slow progress is detected */ + unsigned long wait_start; /* The jiffies when waiting for the writeback work to finish */ }; #define __WB_COMPLETION_INIT(_waitq) \ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3e64f14739dd..0c8342747cab 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -277,10 +277,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) rcu_read_lock(); /* - * Paired with store_release in inode_switch_wbs_work_fn() and + * Paired with a release fence in inode_do_switch_wbs() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + cookie->locked = inode_state_read_once(inode) & I_WB_SWITCH; + smp_rmb(); if (unlikely(cookie->locked)) xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 595217b7a6e7..b0395e4ccf90 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -45,6 +45,7 @@ struct device; * bitmap_copy(dst, src, nbits) *dst = *src * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 + * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) @@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, @@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const unsigned long *src1, } static __always_inline +unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) { + *dst = *src1 | *src2; + return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); + } else { + return __bitmap_weighted_or(dst, src1, src2, nbits); + } +} + +static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { diff --git a/include/linux/bug.h b/include/linux/bug.h index a9948a9f1093..17a4933c611b 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -42,6 +42,7 @@ void bug_get_file_line(struct bug_entry *bug, const char **file, struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); +enum bug_trap_type report_bug_entry(struct bug_entry *bug, struct pt_regs *regs); /* These are defined by the architecture */ int is_valid_bugaddr(unsigned long addr); @@ -62,6 +63,13 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, } struct bug_entry; + +static inline enum bug_trap_type +report_bug_entry(struct bug_entry *bug, struct pt_regs *regs) +{ + return BUG_TRAP_TYPE_BUG; +} + static inline void bug_get_file_line(struct bug_entry *bug, const char **file, unsigned int *line) { diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 7fcec025c5e0..559353ad64ac 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -74,7 +74,7 @@ enum cc_attr { CC_ATTR_GUEST_UNROLL_STRING_IO, /** - * @CC_ATTR_SEV_SNP: Guest SNP is active. + * @CC_ATTR_GUEST_SEV_SNP: Guest SNP is active. * * The platform/OS is running as a guest/virtual machine and actively * using AMD SEV-SNP features. diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 733e7f93db66..63e0e2aa1ce9 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -306,8 +306,7 @@ struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); extern void ceph_reset_client_addr(struct ceph_client *client); -extern int __ceph_open_session(struct ceph_client *client, - unsigned long started); +extern int __ceph_open_session(struct ceph_client *client); extern int ceph_open_session(struct ceph_client *client); int ceph_wait_for_latest_osdmap(struct ceph_client *client, unsigned long timeout); diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..0b55a8f6c59e 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -208,7 +208,7 @@ */ #define DEFINE_FREE(_name, _type, _free) \ - static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } + static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } #define __free(_name) __cleanup(__free_##_name) @@ -220,7 +220,7 @@ __val; \ }) -static inline __must_check +static __always_inline __must_check const volatile void * __must_check_fn(const volatile void *val) { return val; } @@ -261,6 +261,10 @@ const volatile void * __must_check_fn(const volatile void *val) * CLASS(name, var)(args...): * declare the variable @var as an instance of the named class * + * CLASS_INIT(name, var, init_expr): + * declare the variable @var as an instance of the named class with + * custom initialization expression. + * * Ex. * * DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd) @@ -274,31 +278,35 @@ const volatile void * __must_check_fn(const volatile void *val) #define DEFINE_CLASS(_name, _type, _exit, _init, _init_args...) \ typedef _type class_##_name##_t; \ -static inline void class_##_name##_destructor(_type *p) \ +static __always_inline void class_##_name##_destructor(_type *p) \ { _type _T = *p; _exit; } \ -static inline _type class_##_name##_constructor(_init_args) \ +static __always_inline _type class_##_name##_constructor(_init_args) \ { _type t = _init; return t; } #define EXTEND_CLASS(_name, ext, _init, _init_args...) \ typedef class_##_name##_t class_##_name##ext##_t; \ -static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\ +static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \ { class_##_name##_destructor(p); } \ -static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ +static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ { class_##_name##_t t = _init; return t; } #define CLASS(_name, var) \ class_##_name##_t var __cleanup(class_##_name##_destructor) = \ class_##_name##_constructor -#define scoped_class(_name, var, args) \ - for (CLASS(_name, var)(args); \ - __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \ - ({ goto _label; })) \ - if (0) { \ -_label: \ - break; \ +#define CLASS_INIT(_name, _var, _init_expr) \ + class_##_name##_t _var __cleanup(class_##_name##_destructor) = (_init_expr) + +#define __scoped_class(_name, var, _label, args...) \ + for (CLASS(_name, var)(args); ; ({ goto _label; })) \ + if (0) { \ +_label: \ + break; \ } else +#define scoped_class(_name, var, args...) \ + __scoped_class(_name, var, __UNIQUE_ID(label), args) + /* * DEFINE_GUARD(name, type, lock, unlock): * trivial wrapper around DEFINE_CLASS() above specifically @@ -340,6 +348,11 @@ _label: \ #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond +#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return (void *)1; } + #define __GUARD_IS_ERR(_ptr) \ ({ \ unsigned long _rc = (__force unsigned long)(_ptr); \ @@ -347,7 +360,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond }) #define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ - static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ { \ void *_ptr = (void *)(__force unsigned long)*(_exp); \ if (IS_ERR(_ptr)) { \ @@ -355,7 +368,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond } \ return _ptr; \ } \ - static inline int class_##_name##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \ { \ long _rc = (__force unsigned long)*(_exp); \ if (!_rc) { \ @@ -384,9 +397,9 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond EXTEND_CLASS(_name, _ext, \ ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \ class_##_name##_t _T) \ - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ { return class_##_name##_lock_ptr(_T); } \ - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ { return class_##_name##_lock_err(_T); } /* @@ -466,7 +479,7 @@ typedef struct { \ __VA_ARGS__; \ } class_##_name##_t; \ \ -static inline void class_##_name##_destructor(class_##_name##_t *_T) \ +static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \ { \ if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \ } \ @@ -474,7 +487,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ -static inline class_##_name##_t class_##_name##_constructor(_type *l) \ +static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \ { \ class_##_name##_t _t = { .lock = l }, *_T = &_t; \ _lock; \ @@ -482,7 +495,7 @@ static inline class_##_name##_t class_##_name##_constructor(_type *l) \ } #define __DEFINE_LOCK_GUARD_0(_name, _lock) \ -static inline class_##_name##_t class_##_name##_constructor(void) \ +static __always_inline class_##_name##_t class_##_name##_constructor(void) \ { \ class_##_name##_t _t = { .lock = (void*)1 }, \ *_T __maybe_unused = &_t; \ @@ -508,9 +521,9 @@ __DEFINE_LOCK_GUARD_0(_name, _lock) if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\ _t; }), \ typeof_member(class_##_name##_t, lock) l) \ - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ { return class_##_name##_lock_ptr(_T); } \ - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ { return class_##_name##_lock_err(_T); } #define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5b45ea7dff3e..ab181d87d71d 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -163,7 +163,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __asm__ ("" : "=r" (var) : "0" (var)) #endif -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) +/* Format: __UNIQUE_ID_<name>_<__COUNTER__> */ +#define __UNIQUE_ID(name) \ + __PASTE(__UNIQUE_ID_, \ + __PASTE(name, \ + __PASTE(_, __COUNTER__))) /** * data_race - mark an expression as containing intentional data races @@ -283,7 +287,7 @@ static inline void *offset_to_ptr(const int *off) */ #define ___ADDRESSABLE(sym, __attrs) \ static void * __used __attrs \ - __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + __UNIQUE_ID(__PASTE(addressable_, sym)) = (void *)(uintptr_t)&sym; #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 59288a2c1ad2..0a1b9598940d 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -250,10 +250,9 @@ struct ftrace_likely_data { /* * GCC does not warn about unused static inline functions for -Wunused-function. * Suppress the warning in clang as well by using __maybe_unused, but enable it - * for W=1 build. This will allow clang to find unused functions. Remove the - * __inline_maybe_unused entirely after fixing most of -Wunused-function warnings. + * for W=2 build. This will allow clang to find unused functions. */ -#ifdef KBUILD_EXTRA_WARN1 +#ifdef KBUILD_EXTRA_WARN2 #define __inline_maybe_unused #else #define __inline_maybe_unused __maybe_unused @@ -461,6 +460,12 @@ struct ftrace_likely_data { # define __nocfi #endif +#if defined(CONFIG_ARCH_USES_CFI_GENERIC_LLVM_PASS) +# define __nocfi_generic __nocfi +#else +# define __nocfi_generic +#endif + /* * Any place that could be marked with the "alloc_size" attribute is also * a place to be marked with the "malloc" attribute, except those that may diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ff8f41ab7ce6..afedfd5bea07 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -126,6 +126,7 @@ extern struct cpumask __cpu_dying_mask; #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; +extern unsigned int __num_possible_cpus; extern cpumask_t cpus_booted_once_mask; @@ -729,6 +730,22 @@ void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, } /** + * cpumask_weighted_or - *dstp = *src1p | *src2p and return the weight of the result + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + * + * Return: The number of bits set in the resulting cpumask @dstp + */ +static __always_inline +unsigned int cpumask_weighted_or(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return bitmap_weighted_or(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), small_cpumask_bits); +} + +/** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input @@ -1005,6 +1022,7 @@ static __always_inline unsigned int cpumask_size(void) #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly +#define CPUMASK_VAR_NULL NULL bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); @@ -1051,6 +1069,7 @@ static __always_inline bool cpumask_available(cpumask_var_t mask) #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly +#define CPUMASK_VAR_NULL {} static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { @@ -1136,13 +1155,13 @@ void init_cpu_possible(const struct cpumask *src); #define __assign_cpu(cpu, mask, val) \ __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) -#define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); +void set_cpu_possible(unsigned int cpu, bool possible); /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * @@ -1195,7 +1214,12 @@ static __always_inline unsigned int num_online_cpus(void) { return raw_atomic_read(&__num_online_cpus); } -#define num_possible_cpus() cpumask_weight(cpu_possible_mask) + +static __always_inline unsigned int num_possible_cpus(void) +{ + return __num_possible_cpus; +} + #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) diff --git a/include/linux/cred.h b/include/linux/cred.h index 89ae50ad2ace..343a140a6ba2 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -20,6 +20,8 @@ struct cred; struct inode; +extern struct task_struct init_task; + /* * COW Supplementary groups list */ @@ -156,6 +158,11 @@ extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); +static inline const struct cred *kernel_cred(void) +{ + /* shut up sparse */ + return rcu_dereference_raw(init_task.cred); +} extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); @@ -180,6 +187,16 @@ static inline const struct cred *revert_creds(const struct cred *revert_cred) return rcu_replace_pointer(current->cred, revert_cred, 1); } +DEFINE_CLASS(override_creds, + const struct cred *, + revert_creds(_T), + override_creds(override_cred), const struct cred *override_cred) + +#define scoped_with_creds(cred) \ + scoped_class(override_creds, __UNIQUE_ID(label), cred) + +#define scoped_with_kernel_creds() scoped_with_creds(kernel_cred()) + /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference @@ -263,6 +280,11 @@ static inline void put_cred(const struct cred *cred) put_cred_many(cred, 1); } +DEFINE_CLASS(prepare_creds, + struct cred *, + if (_T) put_cred(_T), + prepare_creds(), void) + DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T)) /** diff --git a/include/linux/delay.h b/include/linux/delay.h index 89866bab100d..46412c00033a 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -68,7 +68,7 @@ void usleep_range_state(unsigned long min, unsigned long max, * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * - * For basic information please refere to usleep_range_state(). + * For basic information please refer to usleep_range_state(). * * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep. */ @@ -82,10 +82,10 @@ static inline void usleep_range(unsigned long min, unsigned long max) * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * - * For basic information please refere to usleep_range_state(). + * For basic information please refer to usleep_range_state(). * * The sleeping task has the state TASK_IDLE during the sleep to prevent - * contribution to the load avarage. + * contribution to the load average. */ static inline void usleep_range_idle(unsigned long min, unsigned long max) { @@ -96,7 +96,7 @@ static inline void usleep_range_idle(unsigned long min, unsigned long max) * ssleep - wrapper for seconds around msleep * @seconds: Requested sleep duration in seconds * - * Please refere to msleep() for detailed information. + * Please refer to msleep() for detailed information. */ static inline void ssleep(unsigned int seconds) { diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 8248ff9363ee..2ceda49c609f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -90,7 +90,7 @@ */ #define DMA_MAPPING_ERROR (~(dma_addr_t)0) -#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) +#define DMA_BIT_MASK(n) GENMASK_ULL(n - 1, 0) struct dma_iova_state { dma_addr_t addr; diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 69b136e4dd2b..bb3dcded055f 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -60,23 +60,21 @@ #else /* !__ASSEMBLER__ */ #include <uapi/linux/elf.h> +#include <linux/compiler.h> /* * Use an anonymous structure which matches the shape of * Elf{32,64}_Nhdr, but includes the name and desc data. The size and * type of name and desc depend on the macro arguments. "name" must - * be a literal string, and "desc" must be passed by value. You may - * only define one note per line, since __LINE__ is used to generate - * unique symbols. + * be a literal string, and "desc" must be passed by value. */ -#define _ELFNOTE_PASTE(a,b) a##b -#define _ELFNOTE(size, name, unique, type, desc) \ +#define ELFNOTE(size, name, type, desc) \ static const struct { \ struct elf##size##_note _nhdr; \ unsigned char _name[sizeof(name)] \ __attribute__((aligned(sizeof(Elf##size##_Word)))); \ typeof(desc) _desc \ __attribute__((aligned(sizeof(Elf##size##_Word)))); \ - } _ELFNOTE_PASTE(_note_, unique) \ + } __UNIQUE_ID(note) \ __used \ __attribute__((section(".note." name), \ aligned(sizeof(Elf##size##_Word)), \ @@ -89,11 +87,10 @@ name, \ desc \ } -#define ELFNOTE(size, name, type, desc) \ - _ELFNOTE(size, name, __LINE__, type, desc) #define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc) #define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc) + #endif /* __ASSEMBLER__ */ #endif /* _LINUX_ELFNOTE_H */ diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 7177436f0f9e..87efb38b7081 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -3,11 +3,11 @@ #define __LINUX_ENTRYCOMMON_H #include <linux/irq-entry-common.h> +#include <linux/livepatch.h> #include <linux/ptrace.h> +#include <linux/resume_user_mode.h> #include <linux/seccomp.h> #include <linux/sched.h> -#include <linux/livepatch.h> -#include <linux/resume_user_mode.h> #include <asm/entry-common.h> #include <asm/syscall.h> @@ -37,6 +37,7 @@ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_ENTER) + #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ @@ -44,25 +45,7 @@ SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ ARCH_SYSCALL_WORK_EXIT) -/** - * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts - * @regs: Pointer to currents pt_regs - * - * Invoked from architecture specific syscall entry code with interrupts - * disabled. The calling code has to be non-instrumentable. When the - * function returns all state is correct, interrupts are enabled and the - * subsequent functions can be instrumented. - * - * This handles lockdep, RCU (context tracking) and tracing state, i.e. - * the functionality provided by enter_from_user_mode(). - * - * This is invoked when there is extra architecture specific functionality - * to be done between establishing state and handling user mode entry work. - */ -void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); - -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work); +long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); /** * syscall_enter_from_user_mode_work - Check and handle work before invoking @@ -71,8 +54,8 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, * @syscall: The syscall number * * Invoked from architecture specific syscall entry code with interrupts - * enabled after invoking syscall_enter_from_user_mode_prepare() and extra - * architecture specific work. + * enabled after invoking enter_from_user_mode(), enabling interrupts and + * extra architecture specific work. * * Returns: The original or a modified syscall number * @@ -108,8 +91,9 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re * function returns all state is correct, interrupts are enabled and the * subsequent functions can be instrumented. * - * This is combination of syscall_enter_from_user_mode_prepare() and - * syscall_enter_from_user_mode_work(). + * This is the combination of enter_from_user_mode() and + * syscall_enter_from_user_mode_work() to be used when there is no + * architecture specific work to be done between the two. * * Returns: The original or a modified syscall number. See * syscall_enter_from_user_mode_work() for further explanation. @@ -162,7 +146,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) local_irq_enable(); } - rseq_syscall(regs); + rseq_debug_syscall_return(regs); /* * Do one-time syscall specific work. If these work items are @@ -172,7 +156,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) if (unlikely(work & SYSCALL_WORK_EXIT)) syscall_exit_work(regs, work); local_irq_disable_exit_to_user(); - exit_to_user_mode_prepare(regs); + syscall_exit_to_user_mode_prepare(regs); } /** diff --git a/include/linux/entry-virt.h b/include/linux/entry-virt.h index 42c89e3e5ca7..bfa767702d9a 100644 --- a/include/linux/entry-virt.h +++ b/include/linux/entry-virt.h @@ -32,7 +32,7 @@ */ static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work); -#ifndef arch_xfer_to_guest_mode_work +#ifndef arch_xfer_to_guest_mode_handle_work static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work) { return 0; diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c2d8b4ec62eb..5c9162193d26 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -492,7 +492,7 @@ struct ethtool_pause_stats { }; #define ETHTOOL_MAX_LANES 8 -/** +/* * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for * the end-of-list marker, total 17 items */ diff --git a/include/linux/file.h b/include/linux/file.h index af1768d934a0..cf389fde9bc2 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -127,4 +127,130 @@ extern void __fput_sync(struct file *); extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; +/* + * fd_prepare: Combined fd + file allocation cleanup class. + * @err: Error code to indicate if allocation succeeded. + * @__fd: Allocated fd (may not be accessed directly) + * @__file: Allocated struct file pointer (may not be accessed directly) + * + * Allocates an fd and a file together. On error paths, automatically cleans + * up whichever resource was successfully allocated. Allows flexible file + * allocation with different functions per usage. + * + * Do not use directly. + */ +struct fd_prepare { + s32 err; + s32 __fd; /* do not access directly */ + struct file *__file; /* do not access directly */ +}; + +/* Typedef for fd_prepare cleanup guards. */ +typedef struct fd_prepare class_fd_prepare_t; + +/* + * Accessors for fd_prepare class members. + * _Generic() is used for zero-cost type safety. + */ +#define fd_prepare_fd(_fdf) \ + (_Generic((_fdf), struct fd_prepare: (_fdf).__fd)) + +#define fd_prepare_file(_fdf) \ + (_Generic((_fdf), struct fd_prepare: (_fdf).__file)) + +/* Do not use directly. */ +static inline void class_fd_prepare_destructor(const struct fd_prepare *fdf) +{ + if (unlikely(fdf->err)) { + if (likely(fdf->__fd >= 0)) + put_unused_fd(fdf->__fd); + if (unlikely(!IS_ERR_OR_NULL(fdf->__file))) + fput(fdf->__file); + } +} + +/* Do not use directly. */ +static inline int class_fd_prepare_lock_err(const struct fd_prepare *fdf) +{ + if (unlikely(fdf->err)) + return fdf->err; + if (unlikely(fdf->__fd < 0)) + return fdf->__fd; + if (unlikely(IS_ERR(fdf->__file))) + return PTR_ERR(fdf->__file); + if (unlikely(!fdf->__file)) + return -ENOMEM; + return 0; +} + +/* + * __FD_PREPARE_INIT - Helper to initialize fd_prepare class. + * @_fd_flags: flags for get_unused_fd_flags() + * @_file_owned: expression that returns struct file * + * + * Returns a struct fd_prepare with fd, file, and err set. + * If fd allocation fails, fd will be negative and err will be set. If + * fd succeeds but file_init_expr fails, file will be ERR_PTR and err + * will be set. The err field is the single source of truth for error + * checking. + */ +#define __FD_PREPARE_INIT(_fd_flags, _file_owned) \ + ({ \ + struct fd_prepare fdf = { \ + .__fd = get_unused_fd_flags((_fd_flags)), \ + }; \ + if (likely(fdf.__fd >= 0)) \ + fdf.__file = (_file_owned); \ + fdf.err = ACQUIRE_ERR(fd_prepare, &fdf); \ + fdf; \ + }) + +/* + * FD_PREPARE - Macro to declare and initialize an fd_prepare variable. + * + * Declares and initializes an fd_prepare variable with automatic + * cleanup. No separate scope required - cleanup happens when variable + * goes out of scope. + * + * @_fdf: name of struct fd_prepare variable to define + * @_fd_flags: flags for get_unused_fd_flags() + * @_file_owned: struct file to take ownership of (can be expression) + */ +#define FD_PREPARE(_fdf, _fd_flags, _file_owned) \ + CLASS_INIT(fd_prepare, _fdf, __FD_PREPARE_INIT(_fd_flags, _file_owned)) + +/* + * fd_publish - Publish prepared fd and file to the fd table. + * @_fdf: struct fd_prepare variable + */ +#define fd_publish(_fdf) \ + ({ \ + struct fd_prepare *fdp = &(_fdf); \ + VFS_WARN_ON_ONCE(fdp->err); \ + VFS_WARN_ON_ONCE(fdp->__fd < 0); \ + VFS_WARN_ON_ONCE(IS_ERR_OR_NULL(fdp->__file)); \ + fd_install(fdp->__fd, fdp->__file); \ + fdp->__fd; \ + }) + +/* Do not use directly. */ +#define __FD_ADD(_fdf, _fd_flags, _file_owned) \ + ({ \ + FD_PREPARE(_fdf, _fd_flags, _file_owned); \ + s32 ret = _fdf.err; \ + if (likely(!ret)) \ + ret = fd_publish(_fdf); \ + ret; \ + }) + +/* + * FD_ADD - Allocate and install an fd and file in one step. + * @_fd_flags: flags for get_unused_fd_flags() + * @_file_owned: struct file to take ownership of + * + * Returns the allocated fd number, or negative error code on failure. + */ +#define FD_ADD(_fd_flags, _file_owned) \ + __FD_ADD(__UNIQUE_ID(fd_prepare), _fd_flags, _file_owned) + #endif /* __LINUX_FILE_H */ diff --git a/include/linux/filelock.h b/include/linux/filelock.h index c2ce8ba05d06..54b824c05299 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -159,6 +159,8 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int, int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); +int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg); +int fcntl_getdeleg(struct file *filp, struct delegation *deleg); static inline bool lock_is_unlock(struct file_lock *fl) { @@ -212,7 +214,14 @@ int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); void locks_init_lease(struct file_lease *); void locks_free_lease(struct file_lease *fl); struct file_lease *locks_alloc_lease(void); -int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); + +#define LEASE_BREAK_LEASE BIT(0) // break leases and delegations +#define LEASE_BREAK_DELEG BIT(1) // break delegations only +#define LEASE_BREAK_LAYOUT BIT(2) // break layouts only +#define LEASE_BREAK_NONBLOCK BIT(3) // non-blocking break +#define LEASE_BREAK_OPEN_RDONLY BIT(4) // readonly open event + +int __break_lease(struct inode *inode, unsigned int flags); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lease **, void **priv); int kernel_setlease(struct file *, int, struct file_lease **, void **); @@ -271,6 +280,16 @@ static inline int fcntl_getlease(struct file *filp) return F_UNLCK; } +static inline int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg) +{ + return -EINVAL; +} + +static inline int fcntl_getdeleg(struct file *filp, struct delegation *deleg) +{ + return -EINVAL; +} + static inline bool lock_is_unlock(struct file_lock *fl) { return false; @@ -367,7 +386,7 @@ static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *f return -ENOLCK; } -static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +static inline int __break_lease(struct inode *inode, unsigned int flags) { return 0; } @@ -428,6 +447,17 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) } #ifdef CONFIG_FILE_LOCKING +static inline unsigned int openmode_to_lease_flags(unsigned int mode) +{ + unsigned int flags = 0; + + if ((mode & O_ACCMODE) == O_RDONLY) + flags |= LEASE_BREAK_OPEN_RDONLY; + if (mode & O_NONBLOCK) + flags |= LEASE_BREAK_NONBLOCK; + return flags; +} + static inline int break_lease(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; @@ -443,11 +473,11 @@ static inline int break_lease(struct inode *inode, unsigned int mode) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) - return __break_lease(inode, mode, FL_LEASE); + return __break_lease(inode, LEASE_BREAK_LEASE | openmode_to_lease_flags(mode)); return 0; } -static inline int break_deleg(struct inode *inode, unsigned int mode) +static inline int break_deleg(struct inode *inode, unsigned int flags) { struct file_lock_context *flctx; @@ -461,60 +491,84 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) if (!flctx) return 0; smp_mb(); - if (!list_empty_careful(&flctx->flc_lease)) - return __break_lease(inode, mode, FL_DELEG); + if (!list_empty_careful(&flctx->flc_lease)) { + flags |= LEASE_BREAK_DELEG; + return __break_lease(inode, flags); + } return 0; } -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +struct delegated_inode { + struct inode *di_inode; +}; + +static inline bool is_delegated(struct delegated_inode *di) +{ + return di->di_inode; +} + +static inline int try_break_deleg(struct inode *inode, + struct delegated_inode *di) { int ret; - ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); - if (ret == -EWOULDBLOCK && delegated_inode) { - *delegated_inode = inode; + ret = break_deleg(inode, LEASE_BREAK_NONBLOCK); + if (ret == -EWOULDBLOCK && di) { + di->di_inode = inode; ihold(inode); } return ret; } -static inline int break_deleg_wait(struct inode **delegated_inode) +static inline int break_deleg_wait(struct delegated_inode *di) { int ret; - ret = break_deleg(*delegated_inode, O_WRONLY); - iput(*delegated_inode); - *delegated_inode = NULL; + ret = break_deleg(di->di_inode, 0); + iput(di->di_inode); + di->di_inode = NULL; return ret; } static inline int break_layout(struct inode *inode, bool wait) { smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) - return __break_lease(inode, - wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, - FL_LAYOUT); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) { + unsigned int flags = LEASE_BREAK_LAYOUT; + + if (!wait) + flags |= LEASE_BREAK_NONBLOCK; + + return __break_lease(inode, flags); + } return 0; } #else /* !CONFIG_FILE_LOCKING */ -static inline int break_lease(struct inode *inode, unsigned int mode) +struct delegated_inode { }; + +static inline bool is_delegated(struct delegated_inode *di) +{ + return false; +} + +static inline int break_lease(struct inode *inode, bool wait) { return 0; } -static inline int break_deleg(struct inode *inode, unsigned int mode) +static inline int break_deleg(struct inode *inode, unsigned int flags) { return 0; } -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +static inline int try_break_deleg(struct inode *inode, + struct delegated_inode *delegated_inode) { return 0; } -static inline int break_deleg_wait(struct inode **delegated_inode) +static inline int break_deleg_wait(struct delegated_inode *delegated_inode) { BUG(); return 0; diff --git a/include/linux/filter.h b/include/linux/filter.h index f5c859b8131a..973233b82dc1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -901,6 +901,26 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb) cb->data_end = skb->data + skb_headlen(skb); } +static inline int bpf_prog_run_data_pointers( + const struct bpf_prog *prog, + struct sk_buff *skb) +{ + struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; + void *save_data_meta, *save_data_end; + int res; + + save_data_meta = cb->data_meta; + save_data_end = cb->data_end; + + bpf_compute_data_pointers(skb); + res = bpf_prog_run(prog, skb); + + cb->data_meta = save_data_meta; + cb->data_end = save_data_end; + + return res; +} + /* Similar to bpf_compute_data_pointers(), except that save orginal * data in cb->data and cb->meta_data for restore. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..ce25feb06727 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H +#include <linux/fs/super.h> #include <linux/vfsdebug.h> #include <linux/linkage.h> #include <linux/wait_bit.h> @@ -11,7 +12,6 @@ #include <linux/stat.h> #include <linux/cache.h> #include <linux/list.h> -#include <linux/list_lru.h> #include <linux/llist.h> #include <linux/radix-tree.h> #include <linux/xarray.h> @@ -37,7 +37,6 @@ #include <linux/uuid.h> #include <linux/errseq.h> #include <linux/ioprio.h> -#include <linux/fs_types.h> #include <linux/build_bug.h> #include <linux/stddef.h> #include <linux/mount.h> @@ -52,11 +51,9 @@ #include <asm/byteorder.h> #include <uapi/linux/fs.h> -struct backing_dev_info; struct bdi_writeback; struct bio; struct io_comp_batch; -struct export_operations; struct fiemap_extent_info; struct hd_geometry; struct iovec; @@ -70,16 +67,13 @@ struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; -struct workqueue_struct; struct iov_iter; -struct fscrypt_operations; -struct fsverity_operations; struct fsnotify_mark_connector; -struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct file_kattr; struct iomap_ops; +struct delegated_inode; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -299,11 +293,6 @@ struct iattr { }; /* - * Includes for diskquotas. - */ -#include <linux/quota.h> - -/* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow */ @@ -367,23 +356,9 @@ struct readahead_control; #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) -/* - * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the - * iocb completion can be passed back to the owner for execution from a safe - * context rather than needing to be punted through a workqueue. If this - * flag is set, the bio completion handling may set iocb->dio_complete to a - * handler function and iocb->private to context information for that handler. - * The issuer should call the handler with that context information from task - * context to complete the processing of the iocb. Note that while this - * provides a task context for the dio_complete() callback, it should only be - * used on the completion side for non-IO generating completions. It's fine to - * call blocking functions from this callback, but they should not wait for - * unrelated IO (like cache flushing, new IO generation, etc). - */ -#define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ -#define IOCB_AIO_RW (1 << 23) -#define IOCB_HAS_METADATA (1 << 24) +#define IOCB_AIO_RW (1 << 22) +#define IOCB_HAS_METADATA (1 << 23) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ @@ -400,7 +375,6 @@ struct readahead_control; { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ - { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \ { IOCB_AIO_RW, "AIO_RW" }, \ { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } @@ -412,23 +386,13 @@ struct kiocb { int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ u8 ki_write_stream; - union { - /* - * Only used for async buffered reads, where it denotes the - * page waitqueue associated with completing the read. Valid - * IFF IOCB_WAITQ is set. - */ - struct wait_page_queue *ki_waitq; - /* - * Can be used for O_DIRECT IO, where the completion handling - * is punted back to the issuer of the IO. May only be set - * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer - * must then check for presence of this handler when ki_complete - * is invoked. The data passed in to this handler must be - * assigned to ->private when dio_complete is assigned. - */ - ssize_t (*dio_complete)(void *data); - }; + + /* + * Only used for async buffered reads, where it denotes the page + * waitqueue associated with completing the read. + * Valid IFF IOCB_WAITQ is set. + */ + struct wait_page_queue *ki_waitq; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) @@ -659,13 +623,14 @@ is_uncached_acl(struct posix_acl *acl) return (long)acl & 1; } -#define IOP_FASTPERM 0x0001 -#define IOP_LOOKUP 0x0002 -#define IOP_NOFOLLOW 0x0004 -#define IOP_XATTR 0x0008 +#define IOP_FASTPERM 0x0001 +#define IOP_LOOKUP 0x0002 +#define IOP_NOFOLLOW 0x0004 +#define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 -#define IOP_MGTIME 0x0020 -#define IOP_CACHED_LINK 0x0040 +#define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 +#define IOP_FASTPERM_MAY_EXEC 0x0080 /* * Inode state bits. Protected by inode->i_lock @@ -759,7 +724,7 @@ enum inode_state_bits { /* reserved wait address bit 3 */ }; -enum inode_state_flags_t { +enum inode_state_flags_enum { I_NEW = (1U << __I_NEW), I_SYNC = (1U << __I_SYNC), I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), @@ -786,6 +751,13 @@ enum inode_state_flags_t { #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) /* + * Use inode_state_read() & friends to access. + */ +struct inode_state_flags { + enum inode_state_flags_enum __state; +}; + +/* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' @@ -793,14 +765,13 @@ enum inode_state_flags_t { struct inode { umode_t i_mode; unsigned short i_opflags; - kuid_t i_uid; - kgid_t i_gid; unsigned int i_flags; - #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif + kuid_t i_uid; + kgid_t i_gid; const struct inode_operations *i_op; struct super_block *i_sb; @@ -843,7 +814,7 @@ struct inode { #endif /* Misc */ - enum inode_state_flags_t i_state; + struct inode_state_flags i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -902,6 +873,80 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; +/* + * i_state handling + * + * We hide all of it behind helpers so that we can validate consumers. + */ +static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode) +{ + return READ_ONCE(inode->i_state.__state); +} + +static inline enum inode_state_flags_enum inode_state_read(struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + return inode->i_state.__state; +} + +static inline void inode_state_set_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state.__state, inode->i_state.__state | flags); +} + +static inline void inode_state_set(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_set_raw(inode, flags); +} + +static inline void inode_state_clear_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state.__state, inode->i_state.__state & ~flags); +} + +static inline void inode_state_clear(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_clear_raw(inode, flags); +} + +static inline void inode_state_assign_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state.__state, flags); +} + +static inline void inode_state_assign(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_assign_raw(inode, flags); +} + +static inline void inode_state_replace_raw(struct inode *inode, + enum inode_state_flags_enum clearflags, + enum inode_state_flags_enum setflags) +{ + enum inode_state_flags_enum flags; + flags = inode->i_state.__state; + flags &= ~clearflags; + flags |= setflags; + inode_state_assign_raw(inode, flags); +} + +static inline void inode_state_replace(struct inode *inode, + enum inode_state_flags_enum clearflags, + enum inode_state_flags_enum setflags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_replace_raw(inode, clearflags, setflags); +} + static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { VFS_WARN_ON_INODE(strlen(link) != linklen, inode); @@ -949,6 +994,8 @@ static inline void inode_fake_hash(struct inode *inode) hlist_add_fake(&inode->i_hash); } +void wait_on_new_inode(struct inode *inode); + /* * inode->i_rwsem nesting subclasses for the lock validator: * @@ -1348,49 +1395,6 @@ extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct file *file); /* - * sb->s_flags. Note that these mirror the equivalent MS_* flags where - * represented in both. - */ -#define SB_RDONLY BIT(0) /* Mount read-only */ -#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ -#define SB_NODEV BIT(2) /* Disallow access to device special files */ -#define SB_NOEXEC BIT(3) /* Disallow program execution */ -#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ -#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ -#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ -#define SB_NOATIME BIT(10) /* Do not update access times. */ -#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ -#define SB_SILENT BIT(15) -#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ -#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ -#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ -#define SB_I_VERSION BIT(23) /* Update inode I_version field */ -#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ - -/* These sb flags are internal to the kernel */ -#define SB_DEAD BIT(21) -#define SB_DYING BIT(24) -#define SB_FORCE BIT(27) -#define SB_NOSEC BIT(28) -#define SB_BORN BIT(29) -#define SB_ACTIVE BIT(30) -#define SB_NOUSER BIT(31) - -/* These flags relate to encoding and casefolding */ -#define SB_ENC_STRICT_MODE_FL (1 << 0) -#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) - -#define sb_has_strict_encoding(sb) \ - (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) - -#if IS_ENABLED(CONFIG_UNICODE) -#define sb_no_casefold_compat_fallback(sb) \ - (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) -#else -#define sb_no_casefold_compat_fallback(sb) (1) -#endif - -/* * Umount options */ @@ -1400,191 +1404,6 @@ extern int send_sigurg(struct file *file); #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ -/* sb->s_iflags */ -#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ -#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ -#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ -#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ - -/* sb->s_iflags to limit user namespace mounts */ -#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 -#define SB_I_UNTRUSTED_MOUNTER 0x00000040 -#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 - -#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ -#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ -#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ -#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ -#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ -#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ -#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ - -/* Possible states of 'frozen' field */ -enum { - SB_UNFROZEN = 0, /* FS is unfrozen */ - SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ - SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ - SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop - * internal threads if needed) */ - SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ -}; - -#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) - -struct sb_writers { - unsigned short frozen; /* Is sb frozen? */ - int freeze_kcount; /* How many kernel freeze requests? */ - int freeze_ucount; /* How many userspace freeze requests? */ - const void *freeze_owner; /* Owner of the freeze */ - struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; -}; - -struct mount; - -struct super_block { - struct list_head s_list; /* Keep this first */ - dev_t s_dev; /* search index; _not_ kdev_t */ - unsigned char s_blocksize_bits; - unsigned long s_blocksize; - loff_t s_maxbytes; /* Max file size */ - struct file_system_type *s_type; - const struct super_operations *s_op; - const struct dquot_operations *dq_op; - const struct quotactl_ops *s_qcop; - const struct export_operations *s_export_op; - unsigned long s_flags; - unsigned long s_iflags; /* internal SB_I_* flags */ - unsigned long s_magic; - struct dentry *s_root; - struct rw_semaphore s_umount; - int s_count; - atomic_t s_active; -#ifdef CONFIG_SECURITY - void *s_security; -#endif - const struct xattr_handler * const *s_xattr; -#ifdef CONFIG_FS_ENCRYPTION - const struct fscrypt_operations *s_cop; - struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ -#endif -#ifdef CONFIG_FS_VERITY - const struct fsverity_operations *s_vop; -#endif -#if IS_ENABLED(CONFIG_UNICODE) - struct unicode_map *s_encoding; - __u16 s_encoding_flags; -#endif - struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ - struct mount *s_mounts; /* list of mounts; _not_ for fs use */ - struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ - struct file *s_bdev_file; - struct backing_dev_info *s_bdi; - struct mtd_info *s_mtd; - struct hlist_node s_instances; - unsigned int s_quota_types; /* Bitmask of supported quota types */ - struct quota_info s_dquot; /* Diskquota specific options */ - - struct sb_writers s_writers; - - /* - * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and - * s_fsnotify_info together for cache efficiency. They are frequently - * accessed and rarely modified. - */ - void *s_fs_info; /* Filesystem private info */ - - /* Granularity of c/m/atime in ns (cannot be worse than a second) */ - u32 s_time_gran; - /* Time limits for c/m/atime in seconds */ - time64_t s_time_min; - time64_t s_time_max; -#ifdef CONFIG_FSNOTIFY - u32 s_fsnotify_mask; - struct fsnotify_sb_info *s_fsnotify_info; -#endif - - /* - * q: why are s_id and s_sysfs_name not the same? both are human - * readable strings that identify the filesystem - * a: s_id is allowed to change at runtime; it's used in log messages, - * and we want to when a device starts out as single device (s_id is dev - * name) but then a device is hot added and we have to switch to - * identifying it by UUID - * but s_sysfs_name is a handle for programmatic access, and can't - * change at runtime - */ - char s_id[32]; /* Informational name */ - uuid_t s_uuid; /* UUID */ - u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ - - /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ - char s_sysfs_name[UUID_STRING_LEN + 1]; - - unsigned int s_max_links; - unsigned int s_d_flags; /* default d_flags for dentries */ - - /* - * The next field is for VFS *only*. No filesystems have any business - * even looking at it. You had been warned. - */ - struct mutex s_vfs_rename_mutex; /* Kludge */ - - /* - * Filesystem subtype. If non-empty the filesystem type field - * in /proc/mounts will be "type.subtype" - */ - const char *s_subtype; - - const struct dentry_operations *__s_d_op; /* default d_op for dentries */ - - struct shrinker *s_shrink; /* per-sb shrinker handle */ - - /* Number of inodes with nlink == 0 but still referenced */ - atomic_long_t s_remove_count; - - /* Read-only state of the superblock is being changed */ - int s_readonly_remount; - - /* per-sb errseq_t for reporting writeback errors via syncfs */ - errseq_t s_wb_err; - - /* AIO completions deferred from interrupt context */ - struct workqueue_struct *s_dio_done_wq; - struct hlist_head s_pins; - - /* - * Owning user namespace and default context in which to - * interpret filesystem uids, gids, quotas, device nodes, - * xattrs and security labels. - */ - struct user_namespace *s_user_ns; - - /* - * The list_lru structure is essentially just a pointer to a table - * of per-node lru lists, each of which has its own spinlock. - * There is no need to put them into separate cachelines. - */ - struct list_lru s_dentry_lru; - struct list_lru s_inode_lru; - struct rcu_head rcu; - struct work_struct destroy_work; - - struct mutex s_sync_lock; /* sync serialisation lock */ - - /* - * Indicates how deep in a filesystem stack this SB is - */ - int s_stack_depth; - - /* s_inode_list_lock protects s_inodes */ - spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; - struct list_head s_inodes; /* all inodes */ - - spinlock_t s_inode_wblist_lock; - struct list_head s_inodes_wb; /* writeback inodes */ -} __randomize_layout; - static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; @@ -1902,66 +1721,6 @@ struct timespec64 simple_inode_init_ts(struct inode *inode); * Snapshotting support. */ -/* - * These are internal functions, please use sb_start_{write,pagefault,intwrite} - * instead. - */ -static inline void __sb_end_write(struct super_block *sb, int level) -{ - percpu_up_read(sb->s_writers.rw_sem + level-1); -} - -static inline void __sb_start_write(struct super_block *sb, int level) -{ - percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); -} - -static inline bool __sb_start_write_trylock(struct super_block *sb, int level) -{ - return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); -} - -#define __sb_writers_acquired(sb, lev) \ - percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) -#define __sb_writers_release(sb, lev) \ - percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_) - -/** - * __sb_write_started - check if sb freeze level is held - * @sb: the super we write to - * @level: the freeze level - * - * * > 0 - sb freeze level is held - * * 0 - sb freeze level is not held - * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN - */ -static inline int __sb_write_started(const struct super_block *sb, int level) -{ - return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); -} - -/** - * sb_write_started - check if SB_FREEZE_WRITE is held - * @sb: the super we write to - * - * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. - */ -static inline bool sb_write_started(const struct super_block *sb) -{ - return __sb_write_started(sb, SB_FREEZE_WRITE); -} - -/** - * sb_write_not_started - check if SB_FREEZE_WRITE is not held - * @sb: the super we write to - * - * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. - */ -static inline bool sb_write_not_started(const struct super_block *sb) -{ - return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; -} - /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to @@ -1992,137 +1751,26 @@ static inline bool file_write_not_started(const struct file *file) return sb_write_not_started(file_inode(file)->i_sb); } -/** - * sb_end_write - drop write access to a superblock - * @sb: the super we wrote to - * - * Decrement number of writers to the filesystem. Wake up possible waiters - * wanting to freeze the filesystem. - */ -static inline void sb_end_write(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_WRITE); -} - -/** - * sb_end_pagefault - drop write access to a superblock from a page fault - * @sb: the super we wrote to - * - * Decrement number of processes handling write page fault to the filesystem. - * Wake up possible waiters wanting to freeze the filesystem. - */ -static inline void sb_end_pagefault(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_PAGEFAULT); -} - -/** - * sb_end_intwrite - drop write access to a superblock for internal fs purposes - * @sb: the super we wrote to - * - * Decrement fs-internal number of writers to the filesystem. Wake up possible - * waiters wanting to freeze the filesystem. - */ -static inline void sb_end_intwrite(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_FS); -} - -/** - * sb_start_write - get write access to a superblock - * @sb: the super we write to - * - * When a process wants to write data or metadata to a file system (i.e. dirty - * a page or an inode), it should embed the operation in a sb_start_write() - - * sb_end_write() pair to get exclusion against file system freezing. This - * function increments number of writers preventing freezing. If the file - * system is already frozen, the function waits until the file system is - * thawed. - * - * Since freeze protection behaves as a lock, users have to preserve - * ordering of freeze protection and other filesystem locks. Generally, - * freeze protection should be the outermost lock. In particular, we have: - * - * sb_start_write - * -> i_rwsem (write path, truncate, directory ops, ...) - * -> s_umount (freeze_super, thaw_super) - */ -static inline void sb_start_write(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_WRITE); -} - -static inline bool sb_start_write_trylock(struct super_block *sb) -{ - return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); -} - -/** - * sb_start_pagefault - get write access to a superblock from a page fault - * @sb: the super we write to - * - * When a process starts handling write page fault, it should embed the - * operation into sb_start_pagefault() - sb_end_pagefault() pair to get - * exclusion against file system freezing. This is needed since the page fault - * is going to dirty a page. This function increments number of running page - * faults preventing freezing. If the file system is already frozen, the - * function waits until the file system is thawed. - * - * Since page fault freeze protection behaves as a lock, users have to preserve - * ordering of freeze protection and other filesystem locks. It is advised to - * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault - * handling code implies lock dependency: - * - * mmap_lock - * -> sb_start_pagefault - */ -static inline void sb_start_pagefault(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_PAGEFAULT); -} - -/** - * sb_start_intwrite - get write access to a superblock for internal fs purposes - * @sb: the super we write to - * - * This is the third level of protection against filesystem freezing. It is - * free for use by a filesystem. The only requirement is that it must rank - * below sb_start_pagefault. - * - * For example filesystem can call sb_start_intwrite() when starting a - * transaction which somewhat eases handling of freezing for internal sources - * of filesystem changes (internal fs threads, discarding preallocation on file - * close, etc.). - */ -static inline void sb_start_intwrite(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_FS); -} - -static inline bool sb_start_intwrite_trylock(struct super_block *sb) -{ - return __sb_start_write_trylock(sb, SB_FREEZE_FS); -} - bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); /* * VFS helper functions.. */ -int vfs_create(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t, bool); +int vfs_create(struct mnt_idmap *, struct dentry *, umode_t, + struct delegated_inode *); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t); + struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, - umode_t, dev_t); + umode_t, dev_t, struct delegated_inode *); int vfs_symlink(struct mnt_idmap *, struct inode *, - struct dentry *, const char *); + struct dentry *, const char *, struct delegated_inode *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, - struct dentry *, struct inode **); -int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); + struct dentry *, struct delegated_inode *); +int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *, + struct delegated_inode *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, - struct inode **); + struct delegated_inode *); /** * struct renamedata - contains all information required for renaming @@ -2140,7 +1788,7 @@ struct renamedata { struct dentry *old_dentry; struct dentry *new_parent; struct dentry *new_dentry; - struct inode **delegated_inode; + struct delegated_inode *delegated_inode; unsigned int flags; } __randomize_layout; @@ -2150,7 +1798,7 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, - WHITEOUT_DEV); + WHITEOUT_DEV, NULL); } struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, @@ -2431,72 +2079,6 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); -/** - * enum freeze_holder - holder of the freeze - * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem - * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem - * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed - * @FREEZE_EXCL: a freeze that can only be undone by the owner - * - * Indicate who the owner of the freeze or thaw request is and whether - * the freeze needs to be exclusive or can nest. - * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the - * same holder aren't allowed. It is however allowed to hold a single - * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at - * the same time. This is relied upon by some filesystems during online - * repair or similar. - */ -enum freeze_holder { - FREEZE_HOLDER_KERNEL = (1U << 0), - FREEZE_HOLDER_USERSPACE = (1U << 1), - FREEZE_MAY_NEST = (1U << 2), - FREEZE_EXCL = (1U << 3), -}; - -struct super_operations { - struct inode *(*alloc_inode)(struct super_block *sb); - void (*destroy_inode)(struct inode *); - void (*free_inode)(struct inode *); - - void (*dirty_inode) (struct inode *, int flags); - int (*write_inode) (struct inode *, struct writeback_control *wbc); - int (*drop_inode) (struct inode *); - void (*evict_inode) (struct inode *); - void (*put_super) (struct super_block *); - int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_super) (struct super_block *, enum freeze_holder who, const void *owner); - int (*freeze_fs) (struct super_block *); - int (*thaw_super) (struct super_block *, enum freeze_holder who, const void *owner); - int (*unfreeze_fs) (struct super_block *); - int (*statfs) (struct dentry *, struct kstatfs *); - int (*remount_fs) (struct super_block *, int *, char *); - void (*umount_begin) (struct super_block *); - - int (*show_options)(struct seq_file *, struct dentry *); - int (*show_devname)(struct seq_file *, struct dentry *); - int (*show_path)(struct seq_file *, struct dentry *); - int (*show_stats)(struct seq_file *, struct dentry *); -#ifdef CONFIG_QUOTA - ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); - ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); - struct dquot __rcu **(*get_dquots)(struct inode *); -#endif - long (*nr_cached_objects)(struct super_block *, - struct shrink_control *); - long (*free_cached_objects)(struct super_block *, - struct shrink_control *); - /* - * If a filesystem can support graceful removal of a device and - * continue read-write operations, implement this callback. - * - * Return 0 if the filesystem can continue read-write. - * Non-zero return value or no such callback means the fs will be shutdown - * as usual. - */ - int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); - void (*shutdown)(struct super_block *sb); -}; - /* * Inode flags - they have no relation to superblock flags now */ @@ -2539,7 +2121,6 @@ struct super_operations { */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) -static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) @@ -2635,8 +2216,8 @@ static inline int icount_read(const struct inode *inode) */ static inline bool inode_is_dirtytime_only(struct inode *inode) { - return (inode->i_state & (I_DIRTY_TIME | I_NEW | - I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; + return (inode_state_read_once(inode) & + (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); @@ -2689,6 +2270,7 @@ struct file_system_type { #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ +#define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -2773,10 +2355,6 @@ extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); -int freeze_super(struct super_block *super, enum freeze_holder who, - const void *freeze_owner); -int thaw_super(struct super_block *super, enum freeze_holder who, - const void *freeze_owner); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); @@ -2819,10 +2397,9 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch va_end(args); } -extern int current_umask(void); - extern void ihold(struct inode * inode); extern void iput(struct inode *); +void iput_not_last(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); int generic_update_time(struct inode *, int); @@ -2963,12 +2540,6 @@ extern struct kmem_cache *names_cachep; #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) -extern struct super_block *blockdev_superblock; -static inline bool sb_is_blkdev_sb(struct super_block *sb) -{ - return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; -} - void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; @@ -3014,7 +2585,7 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); -int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, +int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) @@ -3051,8 +2622,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; - filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, - iocb->ki_pos - 1); + filemap_flush_range(mapping, iocb->ki_pos - count, + iocb->ki_pos - 1); } return count; @@ -3071,7 +2642,7 @@ static inline int bmap(struct inode *inode, sector_t *block) #endif int notify_change(struct mnt_idmap *, struct dentry *, - struct iattr *, struct inode **); + struct iattr *, struct delegated_inode *); int inode_permission(struct mnt_idmap *, struct inode *, int); int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) @@ -3101,7 +2672,7 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) * file_start_write - get write access to a superblock for regular file io * @file: the file we want to write to * - * This is a variant of sb_start_write() which is a noop on non-regualr file. + * This is a variant of sb_start_write() which is a noop on non-regular file. * Should be matched with a call to file_end_write(). */ static inline void file_start_write(struct file *file) @@ -3269,6 +2840,7 @@ extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); extern bool path_is_under(const struct path *, const struct path *); +u64 vfsmount_to_propagation_flags(struct vfsmount *mnt); extern char *file_path(struct file *, char *, int); @@ -3326,7 +2898,7 @@ extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), - void *data); + void *data, bool *isnew); extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup(struct super_block *sb, unsigned long ino); @@ -3378,11 +2950,9 @@ static inline bool is_zero_ino(ino_t ino) return (u32)ino == 0; } -/* - * inode->i_lock must be held - */ static inline void __iget(struct inode *inode) { + lockdep_assert_held(&inode->i_lock); atomic_inc(&inode->i_count); } @@ -3421,10 +2991,7 @@ static inline void remove_inode_hash(struct inode *inode) } extern void inode_sb_list_add(struct inode *inode); -extern void inode_add_lru(struct inode *inode); - -extern int sb_set_blocksize(struct super_block *, int); -extern int sb_min_blocksize(struct super_block *, int); +extern void inode_lru_list_add(struct inode *inode); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); @@ -3606,9 +3173,11 @@ extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -void filesystems_freeze(void); +void filesystems_freeze(bool freeze_all); void filesystems_thaw(void); +void end_dirop(struct dentry *de); + extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); @@ -3745,38 +3314,6 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir, } #endif -static inline struct unicode_map *sb_encoding(const struct super_block *sb) -{ -#if IS_ENABLED(CONFIG_UNICODE) - return sb->s_encoding; -#else - return NULL; -#endif -} - -static inline bool sb_has_encoding(const struct super_block *sb) -{ - return !!sb_encoding(sb); -} - -/* - * Compare if two super blocks have the same encoding and flags - */ -static inline bool sb_same_encoding(const struct super_block *sb1, - const struct super_block *sb2) -{ -#if IS_ENABLED(CONFIG_UNICODE) - if (sb1->s_encoding == sb2->s_encoding) - return true; - - return (sb1->s_encoding && sb2->s_encoding && - (sb1->s_encoding->version == sb2->s_encoding->version) && - (sb1->s_encoding_flags == sb2->s_encoding_flags)); -#else - return true; -#endif -} - int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h new file mode 100644 index 000000000000..f21ffbb6dea5 --- /dev/null +++ b/include/linux/fs/super.h @@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FS_SUPER_H +#define _LINUX_FS_SUPER_H + +#include <linux/fs/super_types.h> +#include <linux/unicode.h> + +/* + * These are internal functions, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +static inline void __sb_end_write(struct super_block *sb, int level) +{ + percpu_up_read(sb->s_writers.rw_sem + level - 1); +} + +static inline void __sb_start_write(struct super_block *sb, int level) +{ + percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); +} + +static inline bool __sb_start_write_trylock(struct super_block *sb, int level) +{ + return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); +} + +#define __sb_writers_acquired(sb, lev) \ + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev) - 1], 1, _THIS_IP_) +#define __sb_writers_release(sb, lev) \ + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev) - 1], _THIS_IP_) + +/** + * __sb_write_started - check if sb freeze level is held + * @sb: the super we write to + * @level: the freeze level + * + * * > 0 - sb freeze level is held + * * 0 - sb freeze level is not held + * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + */ +static inline int __sb_write_started(const struct super_block *sb, int level) +{ + return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); +} + +/** + * sb_write_started - check if SB_FREEZE_WRITE is held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE); +} + +/** + * sb_write_not_started - check if SB_FREEZE_WRITE is not held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_not_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; +} + +/** + * sb_end_write - drop write access to a superblock + * @sb: the super we wrote to + * + * Decrement number of writers to the filesystem. Wake up possible waiters + * wanting to freeze the filesystem. + */ +static inline void sb_end_write(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_WRITE); +} + +/** + * sb_end_pagefault - drop write access to a superblock from a page fault + * @sb: the super we wrote to + * + * Decrement number of processes handling write page fault to the filesystem. + * Wake up possible waiters wanting to freeze the filesystem. + */ +static inline void sb_end_pagefault(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_end_intwrite - drop write access to a superblock for internal fs purposes + * @sb: the super we wrote to + * + * Decrement fs-internal number of writers to the filesystem. Wake up possible + * waiters wanting to freeze the filesystem. + */ +static inline void sb_end_intwrite(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_FS); +} + +/** + * sb_start_write - get write access to a superblock + * @sb: the super we write to + * + * When a process wants to write data or metadata to a file system (i.e. dirty + * a page or an inode), it should embed the operation in a sb_start_write() - + * sb_end_write() pair to get exclusion against file system freezing. This + * function increments number of writers preventing freezing. If the file + * system is already frozen, the function waits until the file system is + * thawed. + * + * Since freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. Generally, + * freeze protection should be the outermost lock. In particular, we have: + * + * sb_start_write + * -> i_rwsem (write path, truncate, directory ops, ...) + * -> s_umount (freeze_super, thaw_super) + */ +static inline void sb_start_write(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_WRITE); +} + +DEFINE_GUARD(super_write, + struct super_block *, + sb_start_write(_T), + sb_end_write(_T)) + +static inline bool sb_start_write_trylock(struct super_block *sb) +{ + return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); +} + +/** + * sb_start_pagefault - get write access to a superblock from a page fault + * @sb: the super we write to + * + * When a process starts handling write page fault, it should embed the + * operation into sb_start_pagefault() - sb_end_pagefault() pair to get + * exclusion against file system freezing. This is needed since the page fault + * is going to dirty a page. This function increments number of running page + * faults preventing freezing. If the file system is already frozen, the + * function waits until the file system is thawed. + * + * Since page fault freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. It is advised to + * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault + * handling code implies lock dependency: + * + * mmap_lock + * -> sb_start_pagefault + */ +static inline void sb_start_pagefault(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_start_intwrite - get write access to a superblock for internal fs purposes + * @sb: the super we write to + * + * This is the third level of protection against filesystem freezing. It is + * free for use by a filesystem. The only requirement is that it must rank + * below sb_start_pagefault. + * + * For example filesystem can call sb_start_intwrite() when starting a + * transaction which somewhat eases handling of freezing for internal sources + * of filesystem changes (internal fs threads, discarding preallocation on file + * close, etc.). + */ +static inline void sb_start_intwrite(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_FS); +} + +static inline bool sb_start_intwrite_trylock(struct super_block *sb) +{ + return __sb_start_write_trylock(sb, SB_FREEZE_FS); +} + +static inline bool sb_rdonly(const struct super_block *sb) +{ + return sb->s_flags & SB_RDONLY; +} + +static inline bool sb_is_blkdev_sb(struct super_block *sb) +{ + return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; +} + +#if IS_ENABLED(CONFIG_UNICODE) +static inline struct unicode_map *sb_encoding(const struct super_block *sb) +{ + return sb->s_encoding; +} + +/* Compare if two super blocks have the same encoding and flags */ +static inline bool sb_same_encoding(const struct super_block *sb1, + const struct super_block *sb2) +{ + if (sb1->s_encoding == sb2->s_encoding) + return true; + + return (sb1->s_encoding && sb2->s_encoding && + (sb1->s_encoding->version == sb2->s_encoding->version) && + (sb1->s_encoding_flags == sb2->s_encoding_flags)); +} +#else +static inline struct unicode_map *sb_encoding(const struct super_block *sb) +{ + return NULL; +} + +static inline bool sb_same_encoding(const struct super_block *sb1, + const struct super_block *sb2) +{ + return true; +} +#endif + +static inline bool sb_has_encoding(const struct super_block *sb) +{ + return !!sb_encoding(sb); +} + +int sb_set_blocksize(struct super_block *sb, int size); +int __must_check sb_min_blocksize(struct super_block *sb, int size); + +int freeze_super(struct super_block *super, enum freeze_holder who, + const void *freeze_owner); +int thaw_super(struct super_block *super, enum freeze_holder who, + const void *freeze_owner); + +#endif /* _LINUX_FS_SUPER_H */ diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h new file mode 100644 index 000000000000..6bd3009e09b3 --- /dev/null +++ b/include/linux/fs/super_types.h @@ -0,0 +1,336 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FS_SUPER_TYPES_H +#define _LINUX_FS_SUPER_TYPES_H + +#include <linux/fs_dirent.h> +#include <linux/errseq.h> +#include <linux/list_lru.h> +#include <linux/list.h> +#include <linux/list_bl.h> +#include <linux/llist.h> +#include <linux/uidgid.h> +#include <linux/uuid.h> +#include <linux/percpu-rwsem.h> +#include <linux/workqueue_types.h> +#include <linux/quota.h> + +struct backing_dev_info; +struct block_device; +struct dentry; +struct dentry_operations; +struct dquot_operations; +struct export_operations; +struct file; +struct file_system_type; +struct fscrypt_operations; +struct fsnotify_sb_info; +struct fsverity_operations; +struct kstatfs; +struct mount; +struct mtd_info; +struct quotactl_ops; +struct shrinker; +struct unicode_map; +struct user_namespace; +struct workqueue_struct; +struct writeback_control; +struct xattr_handler; + +extern struct super_block *blockdev_superblock; + +/* Possible states of 'frozen' field */ +enum { + SB_UNFROZEN = 0, /* FS is unfrozen */ + SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ + SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ + SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop internal threads if needed) */ + SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ +}; + +#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) + +struct sb_writers { + unsigned short frozen; /* Is sb frozen? */ + int freeze_kcount; /* How many kernel freeze requests? */ + int freeze_ucount; /* How many userspace freeze requests? */ + const void *freeze_owner; /* Owner of the freeze */ + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; +}; + +/** + * enum freeze_holder - holder of the freeze + * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem + * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem + * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed + * @FREEZE_EXCL: a freeze that can only be undone by the owner + * + * Indicate who the owner of the freeze or thaw request is and whether + * the freeze needs to be exclusive or can nest. + * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the + * same holder aren't allowed. It is however allowed to hold a single + * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at + * the same time. This is relied upon by some filesystems during online + * repair or similar. + */ +enum freeze_holder { + FREEZE_HOLDER_KERNEL = (1U << 0), + FREEZE_HOLDER_USERSPACE = (1U << 1), + FREEZE_MAY_NEST = (1U << 2), + FREEZE_EXCL = (1U << 3), +}; + +struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); + void (*destroy_inode)(struct inode *inode); + void (*free_inode)(struct inode *inode); + void (*dirty_inode)(struct inode *inode, int flags); + int (*write_inode)(struct inode *inode, struct writeback_control *wbc); + int (*drop_inode)(struct inode *inode); + void (*evict_inode)(struct inode *inode); + void (*put_super)(struct super_block *sb); + int (*sync_fs)(struct super_block *sb, int wait); + int (*freeze_super)(struct super_block *sb, enum freeze_holder who, + const void *owner); + int (*freeze_fs)(struct super_block *sb); + int (*thaw_super)(struct super_block *sb, enum freeze_holder who, + const void *owner); + int (*unfreeze_fs)(struct super_block *sb); + int (*statfs)(struct dentry *dentry, struct kstatfs *kstatfs); + int (*remount_fs) (struct super_block *, int *, char *); + void (*umount_begin)(struct super_block *sb); + + int (*show_options)(struct seq_file *seq, struct dentry *dentry); + int (*show_devname)(struct seq_file *seq, struct dentry *dentry); + int (*show_path)(struct seq_file *seq, struct dentry *dentry); + int (*show_stats)(struct seq_file *seq, struct dentry *dentry); +#ifdef CONFIG_QUOTA + ssize_t (*quota_read)(struct super_block *sb, int type, char *data, + size_t len, loff_t off); + ssize_t (*quota_write)(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + struct dquot __rcu **(*get_dquots)(struct inode *inode); +#endif + long (*nr_cached_objects)(struct super_block *sb, + struct shrink_control *sc); + long (*free_cached_objects)(struct super_block *sb, + struct shrink_control *sc); + /* + * If a filesystem can support graceful removal of a device and + * continue read-write operations, implement this callback. + * + * Return 0 if the filesystem can continue read-write. + * Non-zero return value or no such callback means the fs will be shutdown + * as usual. + */ + int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); + void (*shutdown)(struct super_block *sb); +}; + +struct super_block { + struct list_head s_list; /* Keep this first */ + dev_t s_dev; /* search index; _not_ kdev_t */ + unsigned char s_blocksize_bits; + unsigned long s_blocksize; + loff_t s_maxbytes; /* Max file size */ + struct file_system_type *s_type; + const struct super_operations *s_op; + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; + unsigned long s_flags; + unsigned long s_iflags; /* internal SB_I_* flags */ + unsigned long s_magic; + struct dentry *s_root; + struct rw_semaphore s_umount; + int s_count; + atomic_t s_active; +#ifdef CONFIG_SECURITY + void *s_security; +#endif + const struct xattr_handler *const *s_xattr; +#ifdef CONFIG_FS_ENCRYPTION + const struct fscrypt_operations *s_cop; + struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ +#endif +#ifdef CONFIG_FS_VERITY + const struct fsverity_operations *s_vop; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + struct unicode_map *s_encoding; + __u16 s_encoding_flags; +#endif + struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ + struct mount *s_mounts; /* list of mounts; _not_ for fs use */ + struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ + struct file *s_bdev_file; + struct backing_dev_info *s_bdi; + struct mtd_info *s_mtd; + struct hlist_node s_instances; + unsigned int s_quota_types; /* Bitmask of supported quota types */ + struct quota_info s_dquot; /* Diskquota specific options */ + + struct sb_writers s_writers; + + /* + * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and + * s_fsnotify_info together for cache efficiency. They are frequently + * accessed and rarely modified. + */ + void *s_fs_info; /* Filesystem private info */ + + /* Granularity of c/m/atime in ns (cannot be worse than a second) */ + u32 s_time_gran; + /* Time limits for c/m/atime in seconds */ + time64_t s_time_min; + time64_t s_time_max; +#ifdef CONFIG_FSNOTIFY + u32 s_fsnotify_mask; + struct fsnotify_sb_info *s_fsnotify_info; +#endif + + /* + * q: why are s_id and s_sysfs_name not the same? both are human + * readable strings that identify the filesystem + * a: s_id is allowed to change at runtime; it's used in log messages, + * and we want to when a device starts out as single device (s_id is dev + * name) but then a device is hot added and we have to switch to + * identifying it by UUID + * but s_sysfs_name is a handle for programmatic access, and can't + * change at runtime + */ + char s_id[32]; /* Informational name */ + uuid_t s_uuid; /* UUID */ + u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ + + /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ + char s_sysfs_name[UUID_STRING_LEN + 1]; + + unsigned int s_max_links; + unsigned int s_d_flags; /* default d_flags for dentries */ + + /* + * The next field is for VFS *only*. No filesystems have any business + * even looking at it. You had been warned. + */ + struct mutex s_vfs_rename_mutex; /* Kludge */ + + /* + * Filesystem subtype. If non-empty the filesystem type field + * in /proc/mounts will be "type.subtype" + */ + const char *s_subtype; + + const struct dentry_operations *__s_d_op; /* default d_op for dentries */ + + struct shrinker *s_shrink; /* per-sb shrinker handle */ + + /* Number of inodes with nlink == 0 but still referenced */ + atomic_long_t s_remove_count; + + /* Read-only state of the superblock is being changed */ + int s_readonly_remount; + + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + + /* AIO completions deferred from interrupt context */ + struct workqueue_struct *s_dio_done_wq; + struct hlist_head s_pins; + + /* + * Owning user namespace and default context in which to + * interpret filesystem uids, gids, quotas, device nodes, + * xattrs and security labels. + */ + struct user_namespace *s_user_ns; + + /* + * The list_lru structure is essentially just a pointer to a table + * of per-node lru lists, each of which has its own spinlock. + * There is no need to put them into separate cachelines. + */ + struct list_lru s_dentry_lru; + struct list_lru s_inode_lru; + struct rcu_head rcu; + struct work_struct destroy_work; + + struct mutex s_sync_lock; /* sync serialisation lock */ + + /* + * Indicates how deep in a filesystem stack this SB is + */ + int s_stack_depth; + + /* s_inode_list_lock protects s_inodes */ + spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; + struct list_head s_inodes; /* all inodes */ + + spinlock_t s_inode_wblist_lock; + struct list_head s_inodes_wb; /* writeback inodes */ + long s_min_writeback_pages; +} __randomize_layout; + +/* + * sb->s_flags. Note that these mirror the equivalent MS_* flags where + * represented in both. + */ +#define SB_RDONLY BIT(0) /* Mount read-only */ +#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ +#define SB_NODEV BIT(2) /* Disallow access to device special files */ +#define SB_NOEXEC BIT(3) /* Disallow program execution */ +#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ +#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ +#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ +#define SB_NOATIME BIT(10) /* Do not update access times. */ +#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ +#define SB_SILENT BIT(15) +#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ +#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ +#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ +#define SB_I_VERSION BIT(23) /* Update inode I_version field */ +#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ + +/* These sb flags are internal to the kernel */ +#define SB_DEAD BIT(21) +#define SB_DYING BIT(24) +#define SB_FORCE BIT(27) +#define SB_NOSEC BIT(28) +#define SB_BORN BIT(29) +#define SB_ACTIVE BIT(30) +#define SB_NOUSER BIT(31) + +/* These flags relate to encoding and casefolding */ +#define SB_ENC_STRICT_MODE_FL (1 << 0) +#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) + +#define sb_has_strict_encoding(sb) \ + (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) + +#if IS_ENABLED(CONFIG_UNICODE) +#define sb_no_casefold_compat_fallback(sb) \ + (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) +#else +#define sb_no_casefold_compat_fallback(sb) (1) +#endif + +/* sb->s_iflags */ +#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ +#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ +#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ +#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ + +/* sb->s_iflags to limit user namespace mounts */ +#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ +#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 +#define SB_I_UNTRUSTED_MOUNTER 0x00000040 +#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 + +#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ +#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ +#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ +#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ +#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ +#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ + +#endif /* _LINUX_FS_SUPER_TYPES_H */ diff --git a/include/linux/fs_types.h b/include/linux/fs_dirent.h index 54816791196f..92f75c5bac19 100644 --- a/include/linux/fs_types.h +++ b/include/linux/fs_dirent.h @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_FS_TYPES_H -#define _LINUX_FS_TYPES_H +#ifndef _LINUX_FS_DIRENT_H +#define _LINUX_FS_DIRENT_H + +#include <linux/stat.h> +#include <linux/types.h> /* * This is a header for the common implementation of dirent @@ -66,10 +69,10 @@ /* * declarations for helper functions, accompanying implementation - * is in fs/fs_types.c + * is in fs/fs_dirent.c */ extern unsigned char fs_ftype_to_dtype(unsigned int filetype); extern unsigned char fs_umode_to_ftype(umode_t mode); extern unsigned char fs_umode_to_dtype(umode_t mode); -#endif +#endif /* _LINUX_FS_DIRENT_H */ diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index baf200ab5c77..0070764b790a 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_STRUCT_H #define _LINUX_FS_STRUCT_H +#include <linux/sched.h> #include <linux/path.h> #include <linux/spinlock.h> #include <linux/seqlock.h> @@ -41,4 +42,9 @@ static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) extern bool current_chrooted(void); +static inline int current_umask(void) +{ + return current->fs->umask; +} + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7ded7df6e9b5..07f8c309e432 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -193,6 +193,10 @@ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs #if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) || \ defined(CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS) +#ifndef arch_ftrace_partial_regs +#define arch_ftrace_partial_regs(regs) do {} while (0) +#endif + static __always_inline struct pt_regs * ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs) { @@ -202,7 +206,11 @@ ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs) * Since arch_ftrace_get_regs() will check some members and may return * NULL, we can not use it. */ - return &arch_ftrace_regs(fregs)->regs; + regs = &arch_ftrace_regs(fregs)->regs; + + /* Allow arch specific updates to regs. */ + arch_ftrace_partial_regs(regs); + return regs; } #endif /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS || CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0ceb4e09306c..623bee335383 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -7,6 +7,7 @@ #include <linux/mmzone.h> #include <linux/topology.h> #include <linux/alloc_tag.h> +#include <linux/cleanup.h> #include <linux/sched.h> struct vm_area_struct; @@ -463,4 +464,6 @@ static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, /* This should be paired with folio_put() rather than free_contig_range(). */ #define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) +DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) + #endif /* __LINUX_GFP_H */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 105cc4c00cc3..abc20f9810fd 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -249,10 +249,12 @@ static inline void clear_highpage_kasan_tagged(struct page *page) kunmap_local(kaddr); } -#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE +#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -static inline void tag_clear_highpage(struct page *page) +/* Return false to let people know we did not initialize the pages */ +static inline bool tag_clear_highpages(struct page *page, int numpages) { + return false; } #endif diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f327d62fc985..71ac78b9f834 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -376,45 +376,30 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); /* - * try_folio_split - try to split a @folio at @page using non uniform split. + * try_folio_split_to_order - try to split a @folio at @page to @new_order using + * non uniform split. * @folio: folio to be split - * @page: split to order-0 at the given page - * @list: store the after-split folios + * @page: split to @new_order at the given page + * @new_order: the target split order * - * Try to split a @folio at @page using non uniform split to order-0, if - * non uniform split is not supported, fall back to uniform split. + * Try to split a @folio at @page using non uniform split to @new_order, if + * non uniform split is not supported, fall back to uniform split. After-split + * folios are put back to LRU list. Use min_order_for_split() to get the lower + * bound of @new_order. * * Return: 0: split is successful, otherwise split failed. */ -static inline int try_folio_split(struct folio *folio, struct page *page, - struct list_head *list) +static inline int try_folio_split_to_order(struct folio *folio, + struct page *page, unsigned int new_order) { - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - if (!non_uniform_split_supported(folio, 0, false)) - return split_huge_page_to_list_to_order(&folio->page, list, - ret); - return folio_split(folio, ret, page, list); + if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) + return split_huge_page_to_list_to_order(&folio->page, NULL, + new_order); + return folio_split(folio, new_order, page, NULL); } static inline int split_huge_page(struct page *page) { - struct folio *folio = page_folio(page); - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - /* - * split_huge_page() locks the page before splitting and - * expects the same page that has been split to be locked when - * returned. split_folio(page_folio(page)) cannot be used here - * because it converts the page to folio and passes the head - * page to be split. - */ - return split_huge_page_to_list_to_order(page, NULL, ret); + return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio, bool partially_mapped); @@ -597,14 +582,20 @@ static inline int split_huge_page(struct page *page) return -EINVAL; } +static inline int min_order_for_split(struct folio *folio) +{ + VM_WARN_ON_ONCE_FOLIO(1, folio); + return -EINVAL; +} + static inline int split_folio_to_list(struct folio *folio, struct list_head *list) { VM_WARN_ON_ONCE_FOLIO(1, folio); return -EINVAL; } -static inline int try_folio_split(struct folio *folio, struct page *page, - struct list_head *list) +static inline int try_folio_split_to_order(struct folio *folio, + struct page *page, unsigned int new_order) { VM_WARN_ON_ONCE_FOLIO(1, folio); return -EINVAL; diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h index 5eb66a399002..4f33e6a39797 100644 --- a/include/linux/iio/buffer-dma.h +++ b/include/linux/iio/buffer-dma.h @@ -174,5 +174,6 @@ int iio_dma_buffer_enqueue_dmabuf(struct iio_buffer *buffer, size_t size, bool cyclic); void iio_dma_buffer_lock_queue(struct iio_buffer *buffer); void iio_dma_buffer_unlock_queue(struct iio_buffer *buffer); +struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer); #endif diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index e72552e026f3..8d770ced66b2 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -50,6 +50,7 @@ struct sg_table; * @enqueue_dmabuf: called from userspace via ioctl to queue this DMABUF * object to this buffer. Requires a valid DMABUF fd, that * was previouly attached to this buffer. + * @get_dma_dev: called to get the DMA channel associated with this buffer. * @lock_queue: called when the core needs to lock the buffer queue; * it is used when enqueueing DMABUF objects. * @unlock_queue: used to unlock a previously locked buffer queue @@ -90,6 +91,7 @@ struct iio_buffer_access_funcs { struct iio_dma_buffer_block *block, struct dma_fence *fence, struct sg_table *sgt, size_t size, bool cyclic); + struct device * (*get_dma_dev)(struct iio_buffer *buffer); void (*lock_queue)(struct iio_buffer *buffer); void (*unlock_queue)(struct iio_buffer *buffer); diff --git a/include/linux/init.h b/include/linux/init.h index 17c1bc712e23..40331923b9f4 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -200,12 +200,13 @@ extern struct module __this_module; /* Format: <modname>__<counter>_<line>_<fn> */ #define __initcall_id(fn) \ + __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(__COUNTER__, \ __PASTE(_, \ __PASTE(__LINE__, \ - __PASTE(_, fn)))))) + __PASTE(_, fn))))))) /* Format: __<prefix>__<iid><id> */ #define __initcall_name(prefix, __iid, id) \ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index bccb3f1f6262..a6cb241ea00c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,7 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; extern struct nsproxy init_nsproxy; -extern struct cred init_cred; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #define INIT_PREV_CPUTIME(x) .prev_cputime = { \ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 51b6484c0493..266f2b39213a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -109,6 +109,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @name: name of the device * @dev_id: cookie to identify the device * @percpu_dev_id: cookie to identify the device + * @affinity: CPUs this irqaction is allowed to run on * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @flags: flags (see IRQF_* above) @@ -121,8 +122,11 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); */ struct irqaction { irq_handler_t handler; - void *dev_id; - void __percpu *percpu_dev_id; + union { + void *dev_id; + void __percpu *percpu_dev_id; + }; + const struct cpumask *affinity; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; @@ -179,7 +183,7 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler, extern int __must_check __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, - void __percpu *percpu_dev_id); + const cpumask_t *affinity, void __percpu *percpu_dev_id); extern int __must_check request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags, @@ -190,12 +194,21 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id) { return __request_percpu_irq(irq, handler, 0, - devname, percpu_dev_id); + devname, NULL, percpu_dev_id); +} + +static inline int __must_check +request_percpu_irq_affinity(unsigned int irq, irq_handler_t handler, + const char *devname, const cpumask_t *affinity, + void __percpu *percpu_dev_id) +{ + return __request_percpu_irq(irq, handler, 0, + devname, affinity, percpu_dev_id); } extern int __must_check -request_percpu_nmi(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *dev); +request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, + const struct cpumask *affinity, void __percpu *dev_id); extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h index 2b8026a39906..9d5791e9f737 100644 --- a/include/linux/interval_tree.h +++ b/include/linux/interval_tree.h @@ -20,6 +20,10 @@ interval_tree_remove(struct interval_tree_node *node, struct rb_root_cached *root); extern struct interval_tree_node * +interval_tree_subtree_search(struct interval_tree_node *node, + unsigned long start, unsigned long last); + +extern struct interval_tree_node * interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index 1b400f26f63d..c5a2fed49eb0 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ * Cond2: start <= ITLAST(node) \ */ \ \ -static ITSTRUCT * \ +ITSTATIC ITSTRUCT * \ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ { \ while (true) { \ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 73dceabc21c8..520e967cb501 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/mm_types.h> #include <linux/blkdev.h> +#include <linux/pagevec.h> struct address_space; struct fiemap_extent_info; @@ -16,6 +17,7 @@ struct inode; struct iomap_iter; struct iomap_dio; struct iomap_writepage_ctx; +struct iomap_read_folio_ctx; struct iov_iter; struct kiocb; struct page; @@ -241,11 +243,12 @@ struct iomap_iter { unsigned flags; struct iomap iomap; struct iomap srcmap; + struct folio_batch *fbatch; void *private; }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); -int iomap_iter_advance(struct iomap_iter *iter, u64 *count); +int iomap_iter_advance(struct iomap_iter *iter, u64 count); /** * iomap_length_trim - trimmed length of the current iomap iteration @@ -282,9 +285,7 @@ static inline u64 iomap_length(const struct iomap_iter *iter) */ static inline int iomap_iter_advance_full(struct iomap_iter *iter) { - u64 length = iomap_length(iter); - - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, iomap_length(iter)); } /** @@ -339,8 +340,10 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); -void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); +void iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); @@ -349,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops); +loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset, + loff_t length); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); @@ -430,6 +435,10 @@ struct iomap_writeback_ops { * An existing mapping from a previous call to this method can be reused * by the file system if it is still valid. * + * If this succeeds, iomap_finish_folio_write() must be called once + * writeback completes for the range, regardless of whether the + * writeback succeeded or failed. + * * Returns the number of bytes processed or a negative errno. */ ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc, @@ -467,14 +476,41 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, loff_t pos, loff_t end_pos, unsigned int dirty_len); int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len); +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len); int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio); int iomap_writepages(struct iomap_writepage_ctx *wpc); +struct iomap_read_folio_ctx { + const struct iomap_read_ops *ops; + struct folio *cur_folio; + struct readahead_control *rac; + void *read_ctx; +}; + +struct iomap_read_ops { + /* + * Read in a folio range. + * + * If this succeeds, iomap_finish_folio_read() must be called after the + * range is read in, regardless of whether the read succeeded or failed. + * + * Returns 0 on success or a negative error on failure. + */ + int (*read_folio_range)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t len); + + /* + * Submit any pending read requests. + * + * This is optional. + */ + void (*submit_read)(struct iomap_read_folio_ctx *ctx); +}; + /* * Flags for direct I/O ->end_io: */ @@ -518,6 +554,14 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) +/* + * Ensure each bio is aligned to fs block size. + * + * For filesystems which need to calculate/verify the checksum of each fs + * block. Otherwise they may not be able to handle unaligned bios. + */ +#define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); @@ -540,4 +584,30 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, extern struct bio_set iomap_ioend_bioset; +#ifdef CONFIG_BLOCK +extern const struct iomap_read_ops iomap_bio_read_ops; + +static inline void iomap_bio_read_folio(struct folio *folio, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .cur_folio = folio, + }; + + iomap_read_folio(ops, &ctx); +} + +static inline void iomap_bio_readahead(struct readahead_control *rac, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .rac = rac, + }; + + iomap_readahead(ops, &ctx); +} +#endif /* CONFIG_BLOCK */ + #endif /* LINUX_IOMAP_H */ diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d643c7c87822..6ab913e57da0 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -2,11 +2,12 @@ #ifndef __LINUX_IRQENTRYCOMMON_H #define __LINUX_IRQENTRYCOMMON_H +#include <linux/context_tracking.h> +#include <linux/kmsan.h> +#include <linux/rseq_entry.h> #include <linux/static_call_types.h> #include <linux/syscalls.h> -#include <linux/context_tracking.h> #include <linux/tick.h> -#include <linux/kmsan.h> #include <linux/unwind_deferred.h> #include <asm/entry-common.h> @@ -29,7 +30,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ - _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ ARCH_EXIT_TO_USER_MODE_WORK) /** @@ -67,6 +68,7 @@ static __always_inline bool arch_in_rcu_eqs(void) { return false; } /** * enter_from_user_mode - Establish state when coming from user mode + * @regs: Pointer to currents pt_regs * * Syscall/interrupt entry disables interrupts, but user mode is traced as * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. @@ -195,14 +197,11 @@ static __always_inline void arch_exit_to_user_mode(void) { } */ void arch_do_signal_or_restart(struct pt_regs *regs); -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - */ -unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work); +/* Handle pending TIF work */ +unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); /** - * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack * * 1) check that interrupts are disabled @@ -210,8 +209,10 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, * 3) call exit_to_user_mode_loop() if any flags from * EXIT_TO_USER_MODE_WORK are set * 4) check that interrupts are still disabled + * + * Don't invoke directly, use the syscall/irqentry_ prefixed variants below */ -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long ti_work; @@ -225,13 +226,52 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) ti_work = exit_to_user_mode_loop(regs, ti_work); arch_exit_to_user_mode_prepare(regs, ti_work); +} +static __always_inline void __exit_to_user_mode_validate(void) +{ /* Ensure that kernel state is sane for a return to userspace */ kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } +/* Temporary workaround to keep ARM64 alive */ +static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode_legacy(); + __exit_to_user_mode_validate(); +} + +/** + * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for + * syscalls and interrupts. + */ +static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_syscall_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + +/** + * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for + * syscalls and interrupts. + */ +static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_irqentry_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * @@ -253,11 +293,11 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) static __always_inline void exit_to_user_mode(void) { instrumentation_begin(); + unwind_reset_info(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); - unwind_reset_info(); user_enter_irqoff(); arch_exit_to_user_mode(); lockdep_hardirqs_on(CALLER_ADDR0); @@ -274,7 +314,11 @@ static __always_inline void exit_to_user_mode(void) * * The function establishes state (lockdep, RCU (context tracking), tracing) */ -void irqentry_enter_from_user_mode(struct pt_regs *regs); +static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + rseq_note_user_irq_entry(); +} /** * irqentry_exit_to_user_mode - Interrupt exit work @@ -289,7 +333,13 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs); * Interrupt exit is not invoking #1 which is the syscall specific one time * work. */ -void irqentry_exit_to_user_mode(struct pt_regs *regs); +static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + irqentry_exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} #ifndef irqentry_state /** @@ -354,6 +404,7 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); * Conditional reschedule with additional sanity checks. */ void raw_irqentry_exit_cond_resched(void); + #ifdef CONFIG_PREEMPT_DYNAMIC #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched diff --git a/include/linux/irq.h b/include/linux/irq.h index c67e76fbcc07..4a9f1d7b08c3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -655,7 +655,6 @@ extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern void handle_fasteoi_nmi(struct irq_desc *desc); -extern void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); extern int irq_chip_pm_get(struct irq_data *data); @@ -719,10 +718,6 @@ static inline void irq_set_chip_and_handler(unsigned int irq, } extern int irq_set_percpu_devid(unsigned int irq); -extern int irq_set_percpu_devid_partition(unsigned int irq, - const struct cpumask *affinity); -extern int irq_get_percpu_devid_partition(unsigned int irq, - struct cpumask *affinity); extern void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 136f2980cba3..c5afd053ae32 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -2,8 +2,9 @@ #ifndef _LINUX_IRQ_WORK_H #define _LINUX_IRQ_WORK_H -#include <linux/smp_types.h> +#include <linux/irq_work_types.h> #include <linux/rcuwait.h> +#include <linux/smp_types.h> /* * An entry can be in one of four states: @@ -14,12 +15,6 @@ * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed */ -struct irq_work { - struct __call_single_node node; - void (*func)(struct irq_work *); - struct rcuwait irqwait; -}; - #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node = { .u_flags = (_flags), }, \ .func = (_func), \ diff --git a/include/linux/irq_work_types.h b/include/linux/irq_work_types.h new file mode 100644 index 000000000000..73abec5bb06e --- /dev/null +++ b/include/linux/irq_work_types.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IRQ_WORK_TYPES_H +#define _LINUX_IRQ_WORK_TYPES_H + +#include <linux/smp_types.h> +#include <linux/types.h> + +struct irq_work { + struct __call_single_node node; + void (*func)(struct irq_work *); + struct rcuwait irqwait; +}; + +#endif diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index d5e6024cb2a8..bc4ddacd6ddc 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -17,12 +17,18 @@ #include <linux/of_irq.h> #include <linux/platform_device.h> +typedef int (*platform_irq_probe_t)(struct platform_device *, struct device_node *); + /* Undefined on purpose */ extern of_irq_init_cb_t typecheck_irq_init_cb; +extern platform_irq_probe_t typecheck_irq_probe; #define typecheck_irq_init_cb(fn) \ (__typecheck(typecheck_irq_init_cb, &fn) ? fn : fn) +#define typecheck_irq_probe(fn) \ + (__typecheck(typecheck_irq_probe, &fn) ? fn : fn) + /* * This macro must be used by the different irqchip drivers to declare * the association between their DT compatible string and their @@ -42,7 +48,7 @@ extern int platform_irqchip_probe(struct platform_device *pdev); static const struct of_device_id drv_name##_irqchip_match_table[] = { #define IRQCHIP_MATCH(compat, fn) { .compatible = compat, \ - .data = typecheck_irq_init_cb(fn), }, + .data = typecheck_irq_probe(fn), }, #define IRQCHIP_PLATFORM_DRIVER_END(drv_name, ...) \ diff --git a/include/linux/irqchip/irq-partition-percpu.h b/include/linux/irqchip/irq-partition-percpu.h deleted file mode 100644 index b35ee22c278f..000000000000 --- a/include/linux/irqchip/irq-partition-percpu.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2016 ARM Limited, All Rights Reserved. - * Author: Marc Zyngier <marc.zyngier@arm.com> - */ - -#ifndef __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H -#define __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H - -#include <linux/fwnode.h> -#include <linux/cpumask_types.h> -#include <linux/irqdomain.h> - -struct partition_affinity { - cpumask_t mask; - void *partition_id; -}; - -struct partition_desc; - -#ifdef CONFIG_PARTITION_PERCPU -int partition_translate_id(struct partition_desc *desc, void *partition_id); -struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode, - struct partition_affinity *parts, - int nr_parts, - int chained_irq, - const struct irq_domain_ops *ops); -struct irq_domain *partition_get_domain(struct partition_desc *dsc); -#else -static inline int partition_translate_id(struct partition_desc *desc, - void *partition_id) -{ - return -EINVAL; -} - -static inline -struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode, - struct partition_affinity *parts, - int nr_parts, - int chained_irq, - const struct irq_domain_ops *ops) -{ - return NULL; -} - -static inline -struct irq_domain *partition_get_domain(struct partition_desc *dsc) -{ - return NULL; -} -#endif - -#endif /* __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index fd091c35d572..37e0b5b5600a 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -82,7 +82,6 @@ struct irq_desc { int threads_handled_last; raw_spinlock_t lock; struct cpumask *percpu_enabled; - const struct cpumask *percpu_affinity; #ifdef CONFIG_SMP const struct cpumask *affinity_hint; struct irq_affinity_notify *affinity_notify; diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 4a86e6b915dd..952d3c8dd6b7 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -44,6 +44,23 @@ struct irq_fwspec { u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS]; }; +/** + * struct irq_fwspec_info - firmware provided IRQ information structure + * + * @flags: Information validity flags + * @affinity: Affinity mask for this interrupt + * + * This structure reports firmware-specific information about an + * interrupt. The only significant information is the affinity of a + * per-CPU interrupt, but this is designed to be extended as required. + */ +struct irq_fwspec_info { + unsigned long flags; + const struct cpumask *affinity; +}; + +#define IRQ_FWSPEC_INFO_AFFINITY_VALID BIT(0) + /* Conversion function from of_phandle_args fields to fwspec */ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, unsigned int count, struct irq_fwspec *fwspec); @@ -69,6 +86,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, * @translate: Given @fwspec, decode the hardware irq number (@out_hwirq) and * linux irq type value (@out_type). This is a generalised @xlate * (over struct irq_fwspec) and is preferred if provided. + * @get_fwspec_info: + * Given @fwspec, report additional firmware-provided information in + * @info. Optional. * @debug_show: For domains to show specific data for an interrupt in debugfs. * * Functions below are provided by the driver and called whenever a new mapping @@ -96,6 +116,7 @@ struct irq_domain_ops { void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); + int (*get_fwspec_info)(struct irq_fwspec *fwspec, struct irq_fwspec_info *info); #endif #ifdef CONFIG_GENERIC_IRQ_DEBUGFS void (*debug_show)(struct seq_file *m, struct irq_domain *d, @@ -602,6 +623,8 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_bas int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq); +int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info); + static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY; @@ -685,6 +708,10 @@ static inline bool irq_domain_is_msi_device(struct irq_domain *domain) return false; } +static inline int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) +{ + return -EINVAL; +} #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_MSI_IRQ @@ -703,12 +730,6 @@ static inline void msi_device_domain_free_wired(struct irq_domain *domain, unsig } #endif -/* Deprecated functions. Will be removed in the merge window */ -static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) -{ - return node ? &node->fwnode : NULL; -} - static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 51a258c24ff5..772919e8096a 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -13,6 +13,7 @@ #include <linux/ftrace.h> #include <linux/completion.h> #include <linux/list.h> +#include <linux/livepatch_external.h> #include <linux/livepatch_sched.h> #if IS_ENABLED(CONFIG_LIVEPATCH) @@ -77,30 +78,6 @@ struct klp_func { bool transition; }; -struct klp_object; - -/** - * struct klp_callbacks - pre/post live-(un)patch callback structure - * @pre_patch: executed before code patching - * @post_patch: executed after code patching - * @pre_unpatch: executed before code unpatching - * @post_unpatch: executed after code unpatching - * @post_unpatch_enabled: flag indicating if post-unpatch callback - * should run - * - * All callbacks are optional. Only the pre-patch callback, if provided, - * will be unconditionally executed. If the parent klp_object fails to - * patch for any reason, including a non-zero error status returned from - * the pre-patch callback, no further callbacks will be executed. - */ -struct klp_callbacks { - int (*pre_patch)(struct klp_object *obj); - void (*post_patch)(struct klp_object *obj); - void (*pre_unpatch)(struct klp_object *obj); - void (*post_unpatch)(struct klp_object *obj); - bool post_unpatch_enabled; -}; - /** * struct klp_object - kernel object structure for live patching * @name: module name (or NULL for vmlinux) diff --git a/include/linux/livepatch_external.h b/include/linux/livepatch_external.h new file mode 100644 index 000000000000..138af19b0f5c --- /dev/null +++ b/include/linux/livepatch_external.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * External livepatch interfaces for patch creation tooling + */ + +#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_ +#define _LINUX_LIVEPATCH_EXTERNAL_H_ + +#include <linux/types.h> + +#define KLP_RELOC_SEC_PREFIX ".klp.rela." +#define KLP_SYM_PREFIX ".klp.sym." + +#define __KLP_PRE_PATCH_PREFIX __klp_pre_patch_callback_ +#define __KLP_POST_PATCH_PREFIX __klp_post_patch_callback_ +#define __KLP_PRE_UNPATCH_PREFIX __klp_pre_unpatch_callback_ +#define __KLP_POST_UNPATCH_PREFIX __klp_post_unpatch_callback_ + +#define KLP_PRE_PATCH_PREFIX __stringify(__KLP_PRE_PATCH_PREFIX) +#define KLP_POST_PATCH_PREFIX __stringify(__KLP_POST_PATCH_PREFIX) +#define KLP_PRE_UNPATCH_PREFIX __stringify(__KLP_PRE_UNPATCH_PREFIX) +#define KLP_POST_UNPATCH_PREFIX __stringify(__KLP_POST_UNPATCH_PREFIX) + +struct klp_object; + +typedef int (*klp_pre_patch_t)(struct klp_object *obj); +typedef void (*klp_post_patch_t)(struct klp_object *obj); +typedef void (*klp_pre_unpatch_t)(struct klp_object *obj); +typedef void (*klp_post_unpatch_t)(struct klp_object *obj); + +/** + * struct klp_callbacks - pre/post live-(un)patch callback structure + * @pre_patch: executed before code patching + * @post_patch: executed after code patching + * @pre_unpatch: executed before code unpatching + * @post_unpatch: executed after code unpatching + * @post_unpatch_enabled: flag indicating if post-unpatch callback + * should run + * + * All callbacks are optional. Only the pre-patch callback, if provided, + * will be unconditionally executed. If the parent klp_object fails to + * patch for any reason, including a non-zero error status returned from + * the pre-patch callback, no further callbacks will be executed. + */ +struct klp_callbacks { + klp_pre_patch_t pre_patch; + klp_post_patch_t post_patch; + klp_pre_unpatch_t pre_unpatch; + klp_post_unpatch_t post_unpatch; + bool post_unpatch_enabled; +}; + +/* + * 'struct klp_{func,object}_ext' are compact "external" representations of + * 'struct klp_{func,object}'. They are used by objtool for livepatch + * generation. The structs are then read by the livepatch module and converted + * to the real structs before calling klp_enable_patch(). + * + * TODO make these the official API for klp_enable_patch(). That should + * simplify livepatch's interface as well as its data structure lifetime + * management. + */ +struct klp_func_ext { + const char *old_name; + void *new_func; + unsigned long sympos; +}; + +struct klp_object_ext { + const char *name; + struct klp_func_ext *funcs; + struct klp_callbacks callbacks; + unsigned int nr_funcs; +}; + +#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */ diff --git a/include/linux/livepatch_helpers.h b/include/linux/livepatch_helpers.h new file mode 100644 index 000000000000..99d68d0773fa --- /dev/null +++ b/include/linux/livepatch_helpers.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LIVEPATCH_HELPERS_H +#define _LINUX_LIVEPATCH_HELPERS_H + +/* + * Interfaces for use by livepatch patches + */ + +#include <linux/syscalls.h> +#include <linux/livepatch.h> + +#ifdef MODULE +#define KLP_OBJNAME __KBUILD_MODNAME +#else +#define KLP_OBJNAME vmlinux +#endif + +/* Livepatch callback registration */ + +#define KLP_CALLBACK_PTRS ".discard.klp_callback_ptrs" + +#define KLP_PRE_PATCH_CALLBACK(func) \ + klp_pre_patch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_PRE_PATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_POST_PATCH_CALLBACK(func) \ + klp_post_patch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_POST_PATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_PRE_UNPATCH_CALLBACK(func) \ + klp_pre_unpatch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_PRE_UNPATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_POST_UNPATCH_CALLBACK(func) \ + klp_post_unpatch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_POST_UNPATCH_PREFIX, KLP_OBJNAME) = func + +/* + * Replace static_call() usage with this macro when create-diff-object + * recommends it due to the original static call key living in a module. + * + * This converts the static call to a regular indirect call. + */ +#define KLP_STATIC_CALL(name) \ + ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func)) + +/* Syscall patching */ + +#define KLP_SYSCALL_DEFINE1(name, ...) KLP_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE2(name, ...) KLP_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE3(name, ...) KLP_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE4(name, ...) KLP_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE5(name, ...) KLP_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE6(name, ...) KLP_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) + +#define KLP_SYSCALL_DEFINEx(x, sname, ...) \ + __KLP_SYSCALL_DEFINEx(x, sname, __VA_ARGS__) + +#ifdef CONFIG_X86_64 +// TODO move this to arch/x86/include/asm/syscall_wrapper.h and share code +#define __KLP_SYSCALL_DEFINEx(x, name, ...) \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + __X64_SYS_STUBx(x, name, __VA_ARGS__) \ + __IA32_SYS_STUBx(x, name, __VA_ARGS__) \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + long ret = __klp_do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ + return ret; \ + } \ + static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +#endif + +#endif /* _LINUX_LIVEPATCH_HELPERS_H */ diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 0d91d060e3e9..b0e6ab329b00 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -6,6 +6,7 @@ /** * local_lock_init - Runtime initialize a lock instance + * @lock: The lock variable */ #define local_lock_init(lock) __local_lock_init(lock) @@ -52,7 +53,8 @@ __local_unlock_irqrestore(this_cpu_ptr(lock), flags) /** - * local_lock_init - Runtime initialize a lock instance + * local_trylock_init - Runtime initialize a lock instance + * @lock: The lock variable */ #define local_trylock_init(lock) __local_trylock_init(lock) diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index a4dc479157b5..8f82b4eb542f 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -99,18 +99,18 @@ do { \ #define __local_lock_acquire(lock) \ do { \ - local_trylock_t *tl; \ - local_lock_t *l; \ + local_trylock_t *__tl; \ + local_lock_t *__l; \ \ - l = (local_lock_t *)(lock); \ - tl = (local_trylock_t *)l; \ + __l = (local_lock_t *)(lock); \ + __tl = (local_trylock_t *)__l; \ _Generic((lock), \ local_trylock_t *: ({ \ - lockdep_assert(tl->acquired == 0); \ - WRITE_ONCE(tl->acquired, 1); \ + lockdep_assert(__tl->acquired == 0); \ + WRITE_ONCE(__tl->acquired, 1); \ }), \ local_lock_t *: (void)0); \ - local_lock_acquire(l); \ + local_lock_acquire(__l); \ } while (0) #define __local_lock(lock) \ @@ -133,36 +133,36 @@ do { \ #define __local_trylock(lock) \ ({ \ - local_trylock_t *tl; \ + local_trylock_t *__tl; \ \ preempt_disable(); \ - tl = (lock); \ - if (READ_ONCE(tl->acquired)) { \ + __tl = (lock); \ + if (READ_ONCE(__tl->acquired)) { \ preempt_enable(); \ - tl = NULL; \ + __tl = NULL; \ } else { \ - WRITE_ONCE(tl->acquired, 1); \ + WRITE_ONCE(__tl->acquired, 1); \ local_trylock_acquire( \ - (local_lock_t *)tl); \ + (local_lock_t *)__tl); \ } \ - !!tl; \ + !!__tl; \ }) #define __local_trylock_irqsave(lock, flags) \ ({ \ - local_trylock_t *tl; \ + local_trylock_t *__tl; \ \ local_irq_save(flags); \ - tl = (lock); \ - if (READ_ONCE(tl->acquired)) { \ + __tl = (lock); \ + if (READ_ONCE(__tl->acquired)) { \ local_irq_restore(flags); \ - tl = NULL; \ + __tl = NULL; \ } else { \ - WRITE_ONCE(tl->acquired, 1); \ + WRITE_ONCE(__tl->acquired, 1); \ local_trylock_acquire( \ - (local_lock_t *)tl); \ + (local_lock_t *)__tl); \ } \ - !!tl; \ + !!__tl; \ }) /* preemption or migration must be disabled before calling __local_lock_is_locked */ @@ -170,16 +170,16 @@ do { \ #define __local_lock_release(lock) \ do { \ - local_trylock_t *tl; \ - local_lock_t *l; \ + local_trylock_t *__tl; \ + local_lock_t *__l; \ \ - l = (local_lock_t *)(lock); \ - tl = (local_trylock_t *)l; \ - local_lock_release(l); \ + __l = (local_lock_t *)(lock); \ + __tl = (local_trylock_t *)__l; \ + local_lock_release(__l); \ _Generic((lock), \ local_trylock_t *: ({ \ - lockdep_assert(tl->acquired == 1); \ - WRITE_ONCE(tl->acquired, 0); \ + lockdep_assert(__tl->acquired == 1); \ + WRITE_ONCE(__tl->acquired, 0); \ }), \ local_lock_t *: (void)0); \ } while (0) @@ -223,12 +223,12 @@ typedef spinlock_t local_trylock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) -#define __local_lock_init(l) \ +#define __local_lock_init(__l) \ do { \ - local_spin_lock_init((l)); \ + local_spin_lock_init((__l)); \ } while (0) -#define __local_trylock_init(l) __local_lock_init(l) +#define __local_trylock_init(__l) __local_lock_init(__l) #define __local_lock(__lock) \ do { \ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 67964dc4db95..dd634103b014 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -616,7 +616,7 @@ do { \ #define lockdep_assert_in_softirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && \ - (!in_softirq() || in_irq() || in_nmi())); \ + (!in_softirq() || in_hardirq() || in_nmi())); \ } while (0) extern void lockdep_assert_in_softirq_func(void); diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h index 4c1a91b07de3..e1555e06e7e5 100644 --- a/include/linux/mailbox/mtk-cmdq-mailbox.h +++ b/include/linux/mailbox/mtk-cmdq-mailbox.h @@ -77,6 +77,16 @@ struct cmdq_pkt { size_t buf_size; /* real buffer size */ }; +/** + * cmdq_get_shift_pa() - get the shift bits of physical address + * @chan: mailbox channel + * + * GCE can only fetch the command buffer address from a 32-bit register. + * Some SOCs support more than 32-bit command buffer address for GCE, which + * requires some shift bits to make the address fit into the 32-bit register. + * + * Return: the shift bits of physical address + */ u8 cmdq_get_shift_pa(struct mbox_chan *chan); #endif /* __MTK_CMDQ_MAILBOX_H__ */ diff --git a/include/linux/map_benchmark.h b/include/linux/map_benchmark.h index 62674c83bde4..48e2ff95332f 100644 --- a/include/linux/map_benchmark.h +++ b/include/linux/map_benchmark.h @@ -27,5 +27,6 @@ struct map_benchmark { __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; #endif /* _KERNEL_DMA_BENCHMARK_H */ diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 7ef2c7c7d803..9d47cdc727ad 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -183,6 +183,7 @@ static inline void mlx5_cq_put(struct mlx5_core_cq *cq) complete(&cq->free); } +void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe); int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen); int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..8dc0a07570cc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2074,7 +2074,7 @@ static inline unsigned long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) +#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) /* * We don't expect any folios that exceed buddy sizes (and consequently * memory sections). @@ -2087,10 +2087,17 @@ static inline unsigned long folio_nr_pages(const struct folio *folio) * pages are guaranteed to be contiguous. */ #define MAX_FOLIO_ORDER PFN_SECTION_SHIFT -#else +#elif defined(CONFIG_HUGETLB_PAGE) /* * There is no real limit on the folio size. We limit them to the maximum we - * currently expect (e.g., hugetlb, dax). + * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect + * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. + */ +#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#else +/* + * Without hugetlb, gigantic folios that are bigger than a single PUD are + * currently impossible. */ #define MAX_FOLIO_ORDER PUD_ORDER #endif @@ -2401,31 +2408,6 @@ struct zap_details { /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) -#ifdef CONFIG_SCHED_MM_CID -void sched_mm_cid_before_execve(struct task_struct *t); -void sched_mm_cid_after_execve(struct task_struct *t); -void sched_mm_cid_fork(struct task_struct *t); -void sched_mm_cid_exit_signals(struct task_struct *t); -static inline int task_mm_cid(struct task_struct *t) -{ - return t->mm_cid; -} -#else -static inline void sched_mm_cid_before_execve(struct task_struct *t) { } -static inline void sched_mm_cid_after_execve(struct task_struct *t) { } -static inline void sched_mm_cid_fork(struct task_struct *t) { } -static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } -static inline int task_mm_cid(struct task_struct *t) -{ - /* - * Use the processor id as a fall-back when the mm cid feature is - * disabled. This provides functional per-cpu data structure accesses - * in user-space, althrough it won't provide the memory usage benefits. - */ - return raw_smp_processor_id(); -} -#endif - #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else @@ -3369,6 +3351,8 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root_cached *root); +struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node, + unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, @@ -3495,10 +3479,10 @@ struct vm_unmapped_area_info { extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ -extern void truncate_inode_pages(struct address_space *, loff_t); -extern void truncate_inode_pages_range(struct address_space *, - loff_t lstart, loff_t lend); -extern void truncate_inode_pages_final(struct address_space *); +void truncate_inode_pages(struct address_space *mapping, loff_t lstart); +void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, + uoff_t lend); +void truncate_inode_pages_final(struct address_space *mapping); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 90e5790c318f..3b7d05e7169c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include <linux/seqlock.h> #include <linux/percpu_counter.h> #include <linux/types.h> +#include <linux/rseq_types.h> #include <linux/bitmap.h> #include <asm/mmu.h> @@ -922,14 +923,6 @@ struct vm_area_struct { #define vma_policy(vma) NULL #endif -#ifdef CONFIG_SCHED_MM_CID -struct mm_cid { - u64 time; - int cid; - int recent_cid; -}; -#endif - /* * Opaque type representing current mm_struct flag state. Must be accessed via * mm_flags_xxx() helper functions. @@ -991,44 +984,9 @@ struct mm_struct { */ atomic_t mm_users; -#ifdef CONFIG_SCHED_MM_CID - /** - * @pcpu_cid: Per-cpu current cid. - * - * Keep track of the currently allocated mm_cid for each cpu. - * The per-cpu mm_cid values are serialized by their respective - * runqueue locks. - */ - struct mm_cid __percpu *pcpu_cid; - /* - * @mm_cid_next_scan: Next mm_cid scan (in jiffies). - * - * When the next mm_cid scan is due (in jiffies). - */ - unsigned long mm_cid_next_scan; - /** - * @nr_cpus_allowed: Number of CPUs allowed for mm. - * - * Number of CPUs allowed in the union of all mm's - * threads allowed CPUs. - */ - unsigned int nr_cpus_allowed; - /** - * @max_nr_cid: Maximum number of allowed concurrency - * IDs allocated. - * - * Track the highest number of allowed concurrency IDs - * allocated for the mm. - */ - atomic_t max_nr_cid; - /** - * @cpus_allowed_lock: Lock protecting mm cpus_allowed. - * - * Provide mutual exclusion for mm cpus_allowed and - * mm nr_cpus_allowed updates. - */ - raw_spinlock_t cpus_allowed_lock; -#endif + /* MM CID related storage */ + struct mm_mm_cid mm_cid; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif @@ -1370,37 +1328,6 @@ static inline void vma_iter_init(struct vma_iterator *vmi, } #ifdef CONFIG_SCHED_MM_CID - -enum mm_cid_state { - MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */ - MM_CID_LAZY_PUT = (1U << 31), -}; - -static inline bool mm_cid_is_unset(int cid) -{ - return cid == MM_CID_UNSET; -} - -static inline bool mm_cid_is_lazy_put(int cid) -{ - return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); -} - -static inline bool mm_cid_is_valid(int cid) -{ - return !(cid & MM_CID_LAZY_PUT); -} - -static inline int mm_cid_set_lazy_put(int cid) -{ - return cid | MM_CID_LAZY_PUT; -} - -static inline int mm_cid_clear_lazy_put(int cid) -{ - return cid & ~MM_CID_LAZY_PUT; -} - /* * mm_cpus_allowed: Union of all mm's threads allowed CPUs. */ @@ -1415,37 +1342,21 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm) } /* Accessor for struct mm_struct's cidmask. */ -static inline cpumask_t *mm_cidmask(struct mm_struct *mm) +static inline unsigned long *mm_cidmask(struct mm_struct *mm) { unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm); /* Skip mm_cpus_allowed */ cid_bitmap += cpumask_size(); - return (struct cpumask *)cid_bitmap; + return (unsigned long *)cid_bitmap; } -static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) -{ - int i; - - for_each_possible_cpu(i) { - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); - - pcpu_cid->cid = MM_CID_UNSET; - pcpu_cid->recent_cid = MM_CID_UNSET; - pcpu_cid->time = 0; - } - mm->nr_cpus_allowed = p->nr_cpus_allowed; - atomic_set(&mm->max_nr_cid, 0); - raw_spin_lock_init(&mm->cpus_allowed_lock); - cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); - cpumask_clear(mm_cidmask(mm)); -} +void mm_init_cid(struct mm_struct *mm, struct task_struct *p); static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) { - mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid); - if (!mm->pcpu_cid) + mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu); + if (!mm->mm_cid.pcpu) return -ENOMEM; mm_init_cid(mm, p); return 0; @@ -1454,37 +1365,24 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct * static inline void mm_destroy_cid(struct mm_struct *mm) { - free_percpu(mm->pcpu_cid); - mm->pcpu_cid = NULL; + free_percpu(mm->mm_cid.pcpu); + mm->mm_cid.pcpu = NULL; } static inline unsigned int mm_cid_size(void) { - return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */ + /* mm_cpus_allowed(), mm_cidmask(). */ + return cpumask_size() + bitmap_size(num_possible_cpus()); } -static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) -{ - struct cpumask *mm_allowed = mm_cpus_allowed(mm); - - if (!mm) - return; - /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ - raw_spin_lock(&mm->cpus_allowed_lock); - cpumask_or(mm_allowed, mm_allowed, cpumask); - WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed)); - raw_spin_unlock(&mm->cpus_allowed_lock); -} #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { } - static inline unsigned int mm_cid_size(void) { return 0; } -static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } #endif /* CONFIG_SCHED_MM_CID */ struct mmu_gather; diff --git a/include/linux/module.h b/include/linux/module.h index e135cc79acee..d80c3ea57472 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -251,10 +251,11 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name); */ #define __mod_device_table(type, name) \ __PASTE(__mod_device_table__, \ + __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(type, \ - __PASTE(__, name))))) + __PASTE(__, name)))))) /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ diff --git a/include/linux/msi.h b/include/linux/msi.h index d415dd15a0a9..8003e3218c46 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -701,9 +701,6 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); -struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, - struct msi_domain_info *info, - struct irq_domain *parent); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); u32 pci_msi_map_rid_ctlr_node(struct pci_dev *pdev, struct device_node **node); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 847b81ca6436..bf535f0118bb 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -86,8 +86,23 @@ do { \ #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) -extern void __mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_init_lockep(lock, name, key); +} +#else +extern void mutex_init_generic(struct mutex *lock); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_init_generic(lock); +} +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ /** * mutex_is_locked - is the mutex locked @@ -111,17 +126,27 @@ extern bool mutex_is_locked(struct mutex *lock); #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) -extern void __mutex_rt_init(struct mutex *lock, const char *name, - struct lock_class_key *key); - #define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex) -#define __mutex_init(mutex, name, key) \ -do { \ - rt_mutex_base_init(&(mutex)->rtmutex); \ - __mutex_rt_init((mutex), name, key); \ -} while (0) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, + struct lock_class_key *key); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_rt_init_lockdep(lock, name, key); +} +#else +extern void mutex_rt_init_generic(struct mutex *mutex); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_rt_init_generic(lock); +} +#endif /* !CONFIG_LOCKDEP */ #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES diff --git a/include/linux/namei.h b/include/linux/namei.h index fed86221c69c..58600cf234bc 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -7,6 +7,7 @@ #include <linux/path.h> #include <linux/fcntl.h> #include <linux/errno.h> +#include <linux/fs_struct.h> enum { MAX_NESTED_LINKS = 8 }; @@ -88,6 +89,81 @@ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); +struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name); +struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name); +struct dentry *start_creating_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name); +struct dentry *start_removing_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name); +struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); +struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); +struct dentry *start_creating_dentry(struct dentry *parent, + struct dentry *child); +struct dentry *start_removing_dentry(struct dentry *parent, + struct dentry *child); + +/* end_creating - finish action started with start_creating + * @child: dentry returned by start_creating() or vfs_mkdir() + * + * Unlock and release the child. This can be called after + * start_creating() whether that function succeeded or not, + * but it is not needed on failure. + * + * If vfs_mkdir() was called then the value returned from that function + * should be given for @child rather than the original dentry, as vfs_mkdir() + * may have provided a new dentry. + * + * + * If vfs_mkdir() was not called, then @child will be a valid dentry and + * @parent will be ignored. + */ +static inline void end_creating(struct dentry *child) +{ + end_dirop(child); +} + +/* end_creating_keep - finish action started with start_creating() and return result + * @child: dentry returned by start_creating() or vfs_mkdir() + * + * Unlock and return the child. This can be called after + * start_creating() whether that function succeeded or not, + * but it is not needed on failure. + * + * If vfs_mkdir() was called then the value returned from that function + * should be given for @child rather than the original dentry, as vfs_mkdir() + * may have provided a new dentry. + * + * Returns: @child, which may be a dentry or an error. + * + */ +static inline struct dentry *end_creating_keep(struct dentry *child) +{ + if (!IS_ERR(child)) + dget(child); + end_dirop(child); + return child; +} + +/** + * end_removing - finish action started with start_removing + * @child: dentry returned by start_removing() + * @parent: dentry given to start_removing() + * + * Unlock and release the child. + * + * This is identical to end_dirop(). It can be passed the result of + * start_removing() whether that was successful or not, but it not needed + * if start_removing() failed. + */ +static inline void end_removing(struct dentry *child) +{ + end_dirop(child); +} + extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); @@ -95,6 +171,13 @@ extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); +int start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last); +int start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last); +int start_renaming_two_dentries(struct renamedata *rd, + struct dentry *old_dentry, struct dentry *new_dentry); +void end_renaming(struct renamedata *rd); /** * mode_strip_umask - handle vfs umask stripping diff --git a/include/linux/net/intel/libie/fwlog.h b/include/linux/net/intel/libie/fwlog.h index 36b13fabca9e..7273c78c826b 100644 --- a/include/linux/net/intel/libie/fwlog.h +++ b/include/linux/net/intel/libie/fwlog.h @@ -78,8 +78,20 @@ struct libie_fwlog { ); }; +#if IS_ENABLED(CONFIG_LIBIE_FWLOG) int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api); void libie_fwlog_deinit(struct libie_fwlog *fwlog); void libie_fwlog_reregister(struct libie_fwlog *fwlog); void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len); +#else +static inline int libie_fwlog_init(struct libie_fwlog *fwlog, + struct libie_fwlog_api *api) +{ + return -EOPNOTSUPP; +} +static inline void libie_fwlog_deinit(struct libie_fwlog *fwlog) { } +static inline void libie_fwlog_reregister(struct libie_fwlog *fwlog) { } +static inline void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, + u16 len) { } +#endif /* CONFIG_LIBIE_FWLOG */ #endif /* _LIBIE_FWLOG_H_ */ diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h new file mode 100644 index 000000000000..b332b019b29c --- /dev/null +++ b/include/linux/ns/ns_common_types.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NS_COMMON_TYPES_H +#define _LINUX_NS_COMMON_TYPES_H + +#include <linux/atomic.h> +#include <linux/ns/nstree_types.h> +#include <linux/rbtree.h> +#include <linux/refcount.h> +#include <linux/types.h> + +struct cgroup_namespace; +struct dentry; +struct ipc_namespace; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct proc_ns_operations; +struct time_namespace; +struct user_namespace; +struct uts_namespace; + +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations utsns_operations; + +/* + * Namespace lifetimes are managed via a two-tier reference counting model: + * + * (1) __ns_ref (refcount_t): Main reference count tracking memory + * lifetime. Controls when the namespace structure itself is freed. + * It also pins the namespace on the namespace trees whereas (2) + * only regulates their visibility to userspace. + * + * (2) __ns_ref_active (atomic_t): Reference count tracking active users. + * Controls visibility of the namespace in the namespace trees. + * Any live task that uses the namespace (via nsproxy or cred) holds + * an active reference. Any open file descriptor or bind-mount of + * the namespace holds an active reference. Once all tasks have + * called exited their namespaces and all file descriptors and + * bind-mounts have been released the active reference count drops + * to zero and the namespace becomes inactive. IOW, the namespace + * cannot be listed or opened via file handles anymore. + * + * Note that it is valid to transition from active to inactive and + * back from inactive to active e.g., when resurrecting an inactive + * namespace tree via the SIOCGSKNS ioctl(). + * + * Relationship and lifecycle states: + * + * - Active (__ns_ref_active > 0): + * Namespace is actively used and visible to userspace. The namespace + * can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file + * handles, or discovered via listns(). + * + * - Inactive (__ns_ref_active == 0, __ns_ref > 0): + * No tasks are actively using the namespace and it isn't pinned by + * any bind-mounts or open file descriptors anymore. But the namespace + * is still kept alive by internal references. For example, the user + * namespace could be pinned by an open file through file->f_cred + * references when one of the now defunct tasks had opened a file and + * handed the file descriptor off to another process via a UNIX + * sockets. Such references keep the namespace structure alive through + * __ns_ref but will not hold an active reference. + * + * - Destroyed (__ns_ref == 0): + * No references remain. The namespace is removed from the tree and freed. + * + * State transitions: + * + * Active -> Inactive: + * When the last task using the namespace exits it drops its active + * references to all namespaces. However, user and pid namespaces + * remain accessible until the task has been reaped. + * + * Inactive -> Active: + * An inactive namespace tree might be resurrected due to e.g., the + * SIOCGSKNS ioctl() on a socket. + * + * Inactive -> Destroyed: + * When __ns_ref drops to zero the namespace is removed from the + * namespaces trees and the memory is freed (after RCU grace period). + * + * Initial namespaces: + * Boot-time namespaces (init_net, init_pid_ns, etc.) start with + * __ns_ref_active = 1 and remain active forever. + * + * @ns_type: type of namespace (e.g., CLONE_NEWNET) + * @stashed: cached dentry to be used by the vfs + * @ops: namespace operations + * @inum: namespace inode number (quickly recycled for non-initial namespaces) + * @__ns_ref: main reference count (do not use directly) + * @ns_tree: namespace tree nodes and active reference count + */ +struct ns_common { + u32 ns_type; + struct dentry *stashed; + const struct proc_ns_operations *ops; + unsigned int inum; + refcount_t __ns_ref; /* do not use directly */ + union { + struct ns_tree; + struct rcu_head ns_rcu; + }; +}; + +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + const struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + const struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + const struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + const struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + const struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + const struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + const struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns, \ + const struct uts_namespace *: &(__ns)->ns) + +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: &init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define ns_init_id(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_ID, \ + struct ipc_namespace *: IPC_NS_INIT_ID, \ + struct mnt_namespace *: MNT_NS_INIT_ID, \ + struct net *: NET_NS_INIT_ID, \ + struct pid_namespace *: PID_NS_INIT_ID, \ + struct time_namespace *: TIME_NS_INIT_ID, \ + struct user_namespace *: USER_NS_INIT_ID, \ + struct uts_namespace *: UTS_NS_INIT_ID) + +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) + +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) + +#endif /* _LINUX_NS_COMMON_TYPES_H */ diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h new file mode 100644 index 000000000000..2fb28ee31efb --- /dev/null +++ b/include/linux/ns/nstree_types.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ +#ifndef _LINUX_NSTREE_TYPES_H +#define _LINUX_NSTREE_TYPES_H + +#include <linux/rbtree.h> +#include <linux/list.h> + +/** + * struct ns_tree_root - Root of a namespace tree + * @ns_rb: Red-black tree root for efficient lookups + * @ns_list_head: List head for sequential iteration + * + * Each namespace tree maintains both an rbtree (for O(log n) lookups) + * and a list (for efficient sequential iteration). The list is kept in + * the same sorted order as the rbtree. + */ +struct ns_tree_root { + struct rb_root ns_rb; + struct list_head ns_list_head; +}; + +/** + * struct ns_tree_node - Node in a namespace tree + * @ns_node: Red-black tree node + * @ns_list_entry: List entry for sequential iteration + * + * Represents a namespace's position in a tree. Each namespace has + * multiple tree nodes for different trees (unified, per-type, owner). + */ +struct ns_tree_node { + struct rb_node ns_node; + struct list_head ns_list_entry; +}; + +/** + * struct ns_tree - Namespace tree nodes and active reference count + * @ns_id: Unique namespace identifier + * @__ns_ref_active: Active reference count (do not use directly) + * @ns_unified_node: Node in the global namespace tree + * @ns_tree_node: Node in the per-type namespace tree + * @ns_owner_node: Node in the owner namespace's tree of owned namespaces + * @ns_owner_root: Root of the tree of namespaces owned by this namespace + * (only used when this namespace is an owner) + */ +struct ns_tree { + u64 ns_id; + atomic_t __ns_ref_active; + struct ns_tree_node ns_unified_node; + struct ns_tree_node ns_tree_node; + struct ns_tree_node ns_owner_node; + struct ns_tree_root ns_owner_root; +}; + +#endif /* _LINUX_NSTREE_TYPES_H */ diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index f5b68b8abb54..825f5865bfc5 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -2,122 +2,44 @@ #ifndef _LINUX_NS_COMMON_H #define _LINUX_NS_COMMON_H +#include <linux/ns/ns_common_types.h> #include <linux/refcount.h> -#include <linux/rbtree.h> +#include <linux/vfsdebug.h> #include <uapi/linux/sched.h> +#include <uapi/linux/nsfs.h> -struct proc_ns_operations; - -struct cgroup_namespace; -struct ipc_namespace; -struct mnt_namespace; -struct net; -struct pid_namespace; -struct time_namespace; -struct user_namespace; -struct uts_namespace; - -extern struct cgroup_namespace init_cgroup_ns; -extern struct ipc_namespace init_ipc_ns; -extern struct mnt_namespace init_mnt_ns; -extern struct net init_net; -extern struct pid_namespace init_pid_ns; -extern struct time_namespace init_time_ns; -extern struct user_namespace init_user_ns; -extern struct uts_namespace init_uts_ns; - -extern const struct proc_ns_operations netns_operations; -extern const struct proc_ns_operations utsns_operations; -extern const struct proc_ns_operations ipcns_operations; -extern const struct proc_ns_operations pidns_operations; -extern const struct proc_ns_operations pidns_for_children_operations; -extern const struct proc_ns_operations userns_operations; -extern const struct proc_ns_operations mntns_operations; -extern const struct proc_ns_operations cgroupns_operations; -extern const struct proc_ns_operations timens_operations; -extern const struct proc_ns_operations timens_for_children_operations; - -struct ns_common { - u32 ns_type; - struct dentry *stashed; - const struct proc_ns_operations *ops; - unsigned int inum; - refcount_t __ns_ref; /* do not use directly */ - union { - struct { - u64 ns_id; - struct rb_node ns_tree_node; - struct list_head ns_list_node; - }; - struct rcu_head ns_rcu; - }; -}; - +bool is_current_namespace(struct ns_common *ns); int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); +struct ns_common *__must_check ns_owner(struct ns_common *ns); + +static __always_inline bool is_ns_init_inum(const struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->inum == 0); + return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, + IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); +} + +static __always_inline bool is_ns_init_id(const struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->ns_id == 0); + return ns->ns_id <= NS_LAST_INIT_ID; +} -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns)->ns, \ - const struct cgroup_namespace *: &(__ns)->ns, \ - struct ipc_namespace *: &(__ns)->ns, \ - const struct ipc_namespace *: &(__ns)->ns, \ - struct mnt_namespace *: &(__ns)->ns, \ - const struct mnt_namespace *: &(__ns)->ns, \ - struct net *: &(__ns)->ns, \ - const struct net *: &(__ns)->ns, \ - struct pid_namespace *: &(__ns)->ns, \ - const struct pid_namespace *: &(__ns)->ns, \ - struct time_namespace *: &(__ns)->ns, \ - const struct time_namespace *: &(__ns)->ns, \ - struct user_namespace *: &(__ns)->ns, \ - const struct user_namespace *: &(__ns)->ns, \ - struct uts_namespace *: &(__ns)->ns, \ - const struct uts_namespace *: &(__ns)->ns) - -#define ns_init_inum(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ - struct ipc_namespace *: IPC_NS_INIT_INO, \ - struct mnt_namespace *: MNT_NS_INIT_INO, \ - struct net *: NET_NS_INIT_INO, \ - struct pid_namespace *: PID_NS_INIT_INO, \ - struct time_namespace *: TIME_NS_INIT_INO, \ - struct user_namespace *: USER_NS_INIT_INO, \ - struct uts_namespace *: UTS_NS_INIT_INO) - -#define ns_init_ns(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &init_cgroup_ns, \ - struct ipc_namespace *: &init_ipc_ns, \ - struct mnt_namespace *: &init_mnt_ns, \ - struct net *: &init_net, \ - struct pid_namespace *: &init_pid_ns, \ - struct time_namespace *: &init_time_ns, \ - struct user_namespace *: &init_user_ns, \ - struct uts_namespace *: &init_uts_ns) - -#define to_ns_operations(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ - struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ - struct mnt_namespace *: &mntns_operations, \ - struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ - struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ - struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ - struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ - struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) - -#define ns_common_type(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CLONE_NEWCGROUP, \ - struct ipc_namespace *: CLONE_NEWIPC, \ - struct mnt_namespace *: CLONE_NEWNS, \ - struct net *: CLONE_NEWNET, \ - struct pid_namespace *: CLONE_NEWPID, \ - struct time_namespace *: CLONE_NEWTIME, \ - struct user_namespace *: CLONE_NEWUSER, \ - struct uts_namespace *: CLONE_NEWUTS) +#define NS_COMMON_INIT(nsname) \ +{ \ + .ns_type = ns_common_type(&nsname), \ + .ns_id = ns_init_id(&nsname), \ + .inum = ns_init_inum(&nsname), \ + .ops = to_ns_operations(&nsname), \ + .stashed = NULL, \ + .__ns_ref = REFCOUNT_INIT(1), \ + .__ns_ref_active = ATOMIC_INIT(1), \ + .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \ + .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \ + .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \ + .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \ +} #define ns_common_init(__ns) \ __ns_common_init(to_ns_common(__ns), \ @@ -133,21 +55,96 @@ void __ns_common_free(struct ns_common *ns); #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) +{ + return atomic_read(&ns->__ns_ref_active); +} + +static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) +{ + return refcount_read(&ns->__ns_ref); +} + static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - return refcount_dec_and_test(&ns->__ns_ref); + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); + return false; + } + if (refcount_dec_and_test(&ns->__ns_ref)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); + return true; + } + return false; } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - return refcount_inc_not_zero(&ns->__ns_ref); + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); + return true; + } + if (refcount_inc_not_zero(&ns->__ns_ref)) + return true; + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); + return false; } -#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) -#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) -#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) -#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) -#define ns_ref_put_and_lock(__ns, __lock) \ - refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) +static __always_inline void __ns_ref_inc(struct ns_common *ns) +{ + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); + return; + } + refcount_inc(&ns->__ns_ref); +} + +static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns, + spinlock_t *ns_lock) +{ + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); + return false; + } + return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); +} + +#define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) +#define ns_ref_inc(__ns) \ + do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0) +#define ns_ref_get(__ns) \ + ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false) +#define ns_ref_put(__ns) \ + ((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false) +#define ns_ref_put_and_lock(__ns, __ns_lock) \ + ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false) + +#define ns_ref_active_read(__ns) \ + ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) + +void __ns_ref_active_put(struct ns_common *ns); + +#define ns_ref_active_put(__ns) \ + do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) + +static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) +{ + if (!__ns_ref_active_read(ns)) { + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + return NULL; + } + if (!__ns_ref_get(ns)) + return NULL; + return ns; +} + +void __ns_ref_active_get(struct ns_common *ns); + +#define ns_ref_active_get(__ns) \ + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) #endif diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h index e5a5fa83d36b..731b67fc2fec 100644 --- a/include/linux/nsfs.h +++ b/include/linux/nsfs.h @@ -37,4 +37,7 @@ void nsfs_init(void); #define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) +void nsproxy_ns_active_get(struct nsproxy *ns); +void nsproxy_ns_active_put(struct nsproxy *ns); + #endif /* _LINUX_NSFS_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index bd118a187dec..5a67648721c7 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -93,10 +93,13 @@ static inline struct cred *nsset_cred(struct nsset *set) */ int copy_namespaces(u64 flags, struct task_struct *tsk); -void exit_task_namespaces(struct task_struct *tsk); +void switch_cred_namespaces(const struct cred *old, const struct cred *new); +void exit_nsproxy_namespaces(struct task_struct *tsk); +void get_cred_namespaces(struct task_struct *tsk); +void exit_cred_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); -void free_nsproxy(struct nsproxy *ns); +void deactivate_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); @@ -104,7 +107,7 @@ int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { if (refcount_dec_and_test(&ns->count)) - free_nsproxy(ns); + deactivate_nsproxy(ns); } static inline void get_nsproxy(struct nsproxy *ns) diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 8b8636690473..175e4625bfa6 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -1,22 +1,34 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #ifndef _LINUX_NSTREE_H #define _LINUX_NSTREE_H -#include <linux/ns_common.h> +#include <linux/ns/nstree_types.h> #include <linux/nsproxy.h> #include <linux/rbtree.h> #include <linux/seqlock.h> #include <linux/rculist.h> #include <linux/cookie.h> +#include <uapi/linux/nsfs.h> -extern struct ns_tree cgroup_ns_tree; -extern struct ns_tree ipc_ns_tree; -extern struct ns_tree mnt_ns_tree; -extern struct ns_tree net_ns_tree; -extern struct ns_tree pid_ns_tree; -extern struct ns_tree time_ns_tree; -extern struct ns_tree user_ns_tree; -extern struct ns_tree uts_ns_tree; +struct ns_common; + +extern struct ns_tree_root cgroup_ns_tree; +extern struct ns_tree_root ipc_ns_tree; +extern struct ns_tree_root mnt_ns_tree; +extern struct ns_tree_root net_ns_tree; +extern struct ns_tree_root pid_ns_tree; +extern struct ns_tree_root time_ns_tree; +extern struct ns_tree_root user_ns_tree; +extern struct ns_tree_root uts_ns_tree; + +void ns_tree_node_init(struct ns_tree_node *node); +void ns_tree_root_init(struct ns_tree_root *root); +bool ns_tree_node_empty(const struct ns_tree_node *node); +struct rb_node *ns_tree_node_add(struct ns_tree_node *node, + struct ns_tree_root *root, + int (*cmp)(struct rb_node *, const struct rb_node *)); +void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root); #define to_ns_tree(__ns) \ _Generic((__ns), \ @@ -29,17 +41,21 @@ extern struct ns_tree uts_ns_tree; struct user_namespace *: &(user_ns_tree), \ struct uts_namespace *: &(uts_ns_tree)) -u64 ns_tree_gen_id(struct ns_common *ns); -void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); -void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); +#define ns_tree_gen_id(__ns) \ + __ns_tree_gen_id(to_ns_common(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0)) + +u64 __ns_tree_gen_id(struct ns_common *ns, u64 id); +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree); +void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree); struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, - struct ns_tree *ns_tree, + struct ns_tree_root *ns_tree, bool previous); -static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id) { - ns_tree_gen_id(ns); + __ns_tree_gen_id(ns, id); __ns_tree_add_raw(ns, ns_tree); } @@ -59,7 +75,9 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) * This function assigns a new id to the namespace and adds it to the * appropriate namespace tree and list. */ -#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) +#define ns_tree_add(__ns) \ + __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0)) /** * ns_tree_remove - Remove a namespace from a namespace tree @@ -73,6 +91,6 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) #define ns_tree_adjoined_rcu(__ns, __previous) \ __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) -#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) +#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node)) #endif /* _LINUX_NSTREE_H */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 46ebaa46e6c5..b18ab53561c9 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -3,16 +3,16 @@ #define _LINUX_OBJTOOL_H #include <linux/objtool_types.h> +#include <linux/annotate.h> #ifdef CONFIG_OBJTOOL -#include <asm/asm.h> - #ifndef __ASSEMBLY__ -#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ +#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ "987: \n\t" \ ".pushsection .discard.unwind_hints\n\t" \ + ANNOTATE_DATA_SPECIAL \ /* struct unwind_hint */ \ ".long 987b - .\n\t" \ ".short " __stringify(sp_offset) "\n\t" \ @@ -53,16 +53,6 @@ #define __ASM_BREF(label) label ## b -#define __ASM_ANNOTATE(label, type) \ - ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ - ".long " __stringify(label) " - .\n\t" \ - ".long " __stringify(type) "\n\t" \ - ".popsection\n\t" - -#define ASM_ANNOTATE(type) \ - "911:\n\t" \ - __ASM_ANNOTATE(911b, type) - #else /* __ASSEMBLY__ */ /* @@ -89,6 +79,7 @@ .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .Lhere_\@: .pushsection .discard.unwind_hints + ANNOTATE_DATA_SPECIAL /* struct unwind_hint */ .long .Lhere_\@ - . .short \sp_offset @@ -101,7 +92,7 @@ .macro STACK_FRAME_NON_STANDARD func:req .pushsection .discard.func_stack_frame_non_standard, "aw" - .long \func - . + .quad \func .popsection .endm @@ -111,14 +102,6 @@ #endif .endm -.macro ANNOTATE type:req -.Lhere_\@: - .pushsection .discard.annotate_insn,"M",@progbits,8 - .long .Lhere_\@ - . - .long \type - .popsection -.endm - #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ @@ -128,84 +111,15 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) -#define __ASM_ANNOTATE(label, type) "" -#define ASM_ANNOTATE(type) #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm -.macro ANNOTATE type:req -.endm #endif #endif /* CONFIG_OBJTOOL */ -#ifndef __ASSEMBLY__ -/* - * Annotate away the various 'relocation to !ENDBR` complaints; knowing that - * these relocations will never be used for indirect calls. - */ -#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) -#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) - -/* - * This should be used immediately before an indirect jump/call. It tells - * objtool the subsequent indirect jump/call is vouched safe for retpoline - * builds. - */ -#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) -/* - * See linux/instrumentation.h - */ -#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) -#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) -/* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). - */ -#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. - */ -#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) -/* - * Use objtool to validate the entry requirement that all code paths do - * VALIDATE_UNRET_END before RET. - * - * NOTE: The macro must be used at the beginning of a global symbol, otherwise - * it will be ignored. - */ -#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) -/* - * This should be used to refer to an instruction that is considered - * terminating, like a noreturn CALL or UD2 when we know they are not -- eg - * WARN using UD2. - */ -#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) -/* - * This should not be used; it annotates away CFI violations. There are a few - * valid use cases like kexec handover to the next kernel image, and there is - * no security concern there. - * - * There are also a few real issues annotated away, like EFI because we can't - * control the EFI code. - */ -#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI)) - -#else -#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR -#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE -/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ -/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ -#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS -#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL -#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN -#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE -#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI -#endif - #if defined(CONFIG_NOINSTR_VALIDATION) && \ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) #define VALIDATE_UNRET_BEGIN ANNOTATE_UNRET_BEGIN diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index aceac94632c8..c6def4049b1a 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -67,4 +67,6 @@ struct unwind_hint { #define ANNOTYPE_REACHABLE 8 #define ANNOTYPE_NOCFI 9 +#define ANNOTYPE_DATA_SPECIAL 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 1db8543dfc8a..1c2bc0281807 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -43,6 +43,8 @@ extern int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq); extern int of_irq_count(struct device_node *dev); extern int of_irq_get(struct device_node *dev, int index); +extern const struct cpumask *of_irq_get_affinity(struct device_node *dev, + int index); extern int of_irq_get_byname(struct device_node *dev, const char *name); extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); @@ -76,6 +78,11 @@ static inline int of_irq_get_byname(struct device_node *dev, const char *name) { return 0; } +static inline const struct cpumask *of_irq_get_affinity(struct device_node *dev, + int index) +{ + return NULL; +} static inline int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs) { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..e601a3144f28 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -38,6 +38,7 @@ int filemap_invalidate_pages(struct address_space *mapping, int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); int filemap_flush(struct address_space *); +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write); int filemap_fdatawait_keep_errors(struct address_space *mapping); int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); int filemap_fdatawait_range_keep_errors(struct address_space *mapping, @@ -53,14 +54,10 @@ static inline int filemap_fdatawait(struct address_space *mapping) bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); -int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc); int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) @@ -942,6 +939,17 @@ static inline pgoff_t folio_next_index(const struct folio *folio) } /** + * folio_next_pos - Get the file position of the next folio. + * @folio: The current folio. + * + * Return: The position of the folio which follows this folio in the file. + */ +static inline loff_t folio_next_pos(const struct folio *folio) +{ + return (loff_t)folio_next_index(folio) << PAGE_SHIFT; +} + +/** * folio_file_page - The page for a particular index. * @folio: The folio which contains this index. * @index: The index we want to look up. @@ -977,6 +985,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); +unsigned filemap_get_folios_dirty(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..bf97d49c23cf 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -412,6 +412,8 @@ struct pci_dev { u16 l1ss; /* L1SS Capability pointer */ #ifdef CONFIG_PCIEASPM struct pcie_link_state *link_state; /* ASPM link state */ + unsigned int aspm_l0s_support:1; /* ASPM L0s support */ + unsigned int aspm_l1_support:1; /* ASPM L1 support */ unsigned int ltr_path:1; /* Latency Tolerance Reporting supported from root to here */ #endif diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 93c9a26492fc..bab26a7d79f4 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -132,8 +132,6 @@ struct arm_pmu { #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) -DECLARE_PER_CPU(struct arm_pmu *, cpu_armpmu); - u64 armpmu_event_update(struct perf_event *event); int armpmu_event_set_period(struct perf_event *event); @@ -190,8 +188,8 @@ bool arm_pmu_irq_is_nmi(void); struct arm_pmu *armpmu_alloc(void); void armpmu_free(struct arm_pmu *pmu); int armpmu_register(struct arm_pmu *pmu); -int armpmu_request_irq(int irq, int cpu); -void armpmu_free_irq(int irq, int cpu); +int armpmu_request_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu); +void armpmu_free_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu); #define ARMV8_PMU_PDEV_NAME "armv8-pmu" diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fd1d91017b99..9870d768db4c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark); + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 445517a72ad0..0e7ae12c96d2 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -61,8 +61,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { - if (ns != &init_pid_ns) - ns_ref_inc(ns); + ns_ref_inc(ns); return ns; } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 9d42d473d201..7f6a92ac9704 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -44,11 +44,11 @@ typedef unsigned int pipe_index_t; typedef unsigned short pipe_index_t; #endif -/* - * We have to declare this outside 'struct pipe_inode_info', - * but then we can't use 'union pipe_index' for an anonymous - * union, so we end up having to duplicate this declaration - * below. Annoying. +/** + * struct pipe_index - pipe indeces + * @head: The point of buffer production + * @tail: The point of buffer consumption + * @head_tail: unsigned long union of @head and @tail */ union pipe_index { unsigned long head_tail; @@ -63,9 +63,7 @@ union pipe_index { * @mutex: mutex protecting the whole thing * @rd_wait: reader wait point in case of empty pipe * @wr_wait: writer wait point in case of full pipe - * @head: The point of buffer production - * @tail: The point of buffer consumption - * @head_tail: unsigned long union of @head and @tail + * @pipe_index: the pipe indeces * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) @@ -87,14 +85,7 @@ struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; - /* This has to match the 'union pipe_index' above */ - union { - unsigned long head_tail; - struct { - pipe_index_t head; - pipe_index_t tail; - }; - }; + union pipe_index; unsigned int max_usage; unsigned int ring_size; diff --git a/include/linux/platform_data/x86/int3472.h b/include/linux/platform_data/x86/int3472.h index 1571e9157fa5..b1b837583d54 100644 --- a/include/linux/platform_data/x86/int3472.h +++ b/include/linux/platform_data/x86/int3472.h @@ -100,7 +100,6 @@ struct int3472_gpio_regulator { struct regulator_consumer_supply supply_map[GPIO_REGULATOR_SUPPLY_MAP_COUNT * 2]; char supply_name_upper[GPIO_SUPPLY_NAME_LENGTH]; char regulator_name[GPIO_REGULATOR_NAME_LENGTH]; - struct gpio_desc *ena_gpio; struct regulator_dev *rdev; struct regulator_desc rdesc; }; diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 074754c23d33..ad66333ce85c 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -102,6 +102,8 @@ devm_platform_ioremap_resource_byname(struct platform_device *pdev, extern int platform_get_irq(struct platform_device *, unsigned int); extern int platform_get_irq_optional(struct platform_device *, unsigned int); +extern int platform_get_irq_affinity(struct platform_device *, unsigned int, + const struct cpumask **); extern int platform_irq_count(struct platform_device *); extern int devm_platform_get_irqs_affinity(struct platform_device *dev, struct irq_affinity *affd, diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 102202185d7a..d964f965c8ff 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -134,11 +134,9 @@ static __always_inline unsigned char interrupt_context_level(void) /* * The following macros are deprecated and should not be used in new code: - * in_irq() - Obsolete version of in_hardirq() * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled */ -#define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) diff --git a/include/linux/prmt.h b/include/linux/prmt.h index c53ab287e932..8cdc987de963 100644 --- a/include/linux/prmt.h +++ b/include/linux/prmt.h @@ -4,9 +4,11 @@ #ifdef CONFIG_ACPI_PRMT void init_prmt(void); +bool acpi_prm_handler_available(const guid_t *handler_guid); int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer); #else static inline void init_prmt(void) { } +static inline bool acpi_prm_handler_available(const guid_t *handler_guid) { return false; } static inline int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer) { return -EOPNOTSUPP; diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h index 2503f7625d65..a651e60d9410 100644 --- a/include/linux/pseudo_fs.h +++ b/include/linux/pseudo_fs.h @@ -9,6 +9,7 @@ struct pseudo_fs_context { const struct xattr_handler * const *xattr; const struct dentry_operations *dops; unsigned long magic; + unsigned int s_d_flags; }; struct pseudo_fs_context *init_pseudo(struct fs_context *fc, diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a7d92718b653..54701668b3df 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -206,6 +206,8 @@ struct rdt_mon_domain { * @arch_has_sparse_bitmasks: True if a bitmask like f00f is valid. * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache * level has CPU scope. + * @io_alloc_capable: True if portion of the cache can be configured + * for I/O traffic. */ struct resctrl_cache { unsigned int cbm_len; @@ -213,6 +215,7 @@ struct resctrl_cache { unsigned int shareable_bits; bool arch_has_sparse_bitmasks; bool arch_has_per_cpu_cfg; + bool io_alloc_capable; }; /** @@ -654,6 +657,27 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid); +/** + * resctrl_arch_io_alloc_enable() - Enable/disable io_alloc feature. + * @r: The resctrl resource. + * @enable: Enable (true) or disable (false) io_alloc on resource @r. + * + * This can be called from any CPU. + * + * Return: + * 0 on success, <0 on error. + */ +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable); + +/** + * resctrl_arch_get_io_alloc_enabled() - Get io_alloc feature state. + * @r: The resctrl resource. + * + * Return: + * true if io_alloc is enabled or false if disabled. + */ +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 7e50bbc94e47..36ddfa1ec301 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -43,7 +43,7 @@ struct restart_block { struct __kernel_timespec __user *rmtp; struct old_timespec32 __user *compat_rmtp; }; - u64 expires; + ktime_t expires; } nanosleep; /* For poll */ struct { diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index e0135e0adae0..bf92227c78d0 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); - rseq_handle_notify_resume(NULL, regs); + rseq_handle_slowpath(regs); } #endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 69553e7c14c1..2266f4dc77b6 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -3,134 +3,164 @@ #define _LINUX_RSEQ_H #ifdef CONFIG_RSEQ - -#include <linux/preempt.h> #include <linux/sched.h> -#ifdef CONFIG_MEMBARRIER -# define RSEQ_EVENT_GUARD irq -#else -# define RSEQ_EVENT_GUARD preempt -#endif - -/* - * Map the event mask on the user-space ABI enum rseq_cs_flags - * for direct mask checks. - */ -enum rseq_event_mask_bits { - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, -}; - -enum rseq_event_mask { - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), -}; - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ - if (t->rseq) - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); -} +#include <uapi/linux/rseq.h> -void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); +void __rseq_handle_slowpath(struct pt_regs *regs); -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) +/* Invoked from resume_user_mode_work() */ +static inline void rseq_handle_slowpath(struct pt_regs *regs) { - if (current->rseq) - __rseq_handle_notify_resume(ksig, regs); + if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) { + if (current->rseq.event.slowpath) + __rseq_handle_slowpath(regs); + } else { + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) + __rseq_handle_slowpath(regs); + } } -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ - scoped_guard(RSEQ_EVENT_GUARD) - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - rseq_handle_notify_resume(ksig, regs); -} +void __rseq_signal_deliver(int sig, struct pt_regs *regs); -/* rseq_preempt() requires preemption to be disabled. */ -static inline void rseq_preempt(struct task_struct *t) +/* + * Invoked from signal delivery to fixup based on the register context before + * switching to the signal delivery context. + */ +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.has_rseq & current->rseq.event.user_irq) + __rseq_signal_deliver(ksig->sig, regs); + } else { + if (current->rseq.event.has_rseq) + __rseq_signal_deliver(ksig->sig, regs); + } } -/* rseq_migrate() requires preemption to be disabled. */ -static inline void rseq_migrate(struct task_struct *t) +static inline void rseq_raise_notify_resume(struct task_struct *t) { - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); + set_tsk_thread_flag(t, TIF_RSEQ); } -/* - * If parent process has a registered restartable sequences area, the - * child inherits. Unregister rseq for a clone with CLONE_VM set. - */ -static inline void rseq_fork(struct task_struct *t, u64 clone_flags) +/* Invoked from context switch to force evaluation on exit to user */ +static __always_inline void rseq_sched_switch_event(struct task_struct *t) { - if (clone_flags & CLONE_VM) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; + struct rseq_event *ev = &t->rseq.event; + + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* + * Avoid a boat load of conditionals by using simple logic + * to determine whether NOTIFY_RESUME needs to be raised. + * + * It's required when the CPU or MM CID has changed or + * the entry was from user space. + */ + bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; + + if (raise) { + ev->sched_switch = true; + rseq_raise_notify_resume(t); + } } else { - t->rseq = current->rseq; - t->rseq_len = current->rseq_len; - t->rseq_sig = current->rseq_sig; - t->rseq_event_mask = current->rseq_event_mask; + if (ev->has_rseq) { + t->rseq.event.sched_switch = true; + rseq_raise_notify_resume(t); + } } } -static inline void rseq_execve(struct task_struct *t) +/* + * Invoked from __set_task_cpu() when a task migrates or from + * mm_cid_schedin() when the CID changes to enforce an IDs update. + * + * This does not raise TIF_NOTIFY_RESUME as that happens in + * rseq_sched_switch_event(). + */ +static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; + t->rseq.event.ids_changed = true; } -#else - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ -} -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) +/* Enforce a full update after RSEQ registration and when execve() failed */ +static inline void rseq_force_update(void) { + if (current->rseq.event.has_rseq) { + current->rseq.event.ids_changed = true; + current->rseq.event.sched_switch = true; + rseq_raise_notify_resume(current); + } } -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) + +/* + * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, + * which clears TIF_NOTIFY_RESUME on architectures that don't use the + * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag. + * + * To avoid updating user space RSEQ in that case just to do it eventually + * again before returning to user space, because __rseq_handle_slowpath() + * does nothing when invoked with NULL register state. + * + * After returning from guest mode, before exiting to userspace, hypervisors + * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary. + */ +static inline void rseq_virt_userspace_exit(void) { + /* + * The generic optimization for deferring RSEQ updates until the next + * exit relies on having a dedicated TIF_RSEQ. + */ + if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && + current->rseq.event.sched_switch) + rseq_raise_notify_resume(current); } -static inline void rseq_preempt(struct task_struct *t) + +static inline void rseq_reset(struct task_struct *t) { + memset(&t->rseq, 0, sizeof(t->rseq)); + t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } -static inline void rseq_migrate(struct task_struct *t) + +static inline void rseq_execve(struct task_struct *t) { + rseq_reset(t); } + +/* + * If parent process has a registered restartable sequences area, the + * child inherits. Unregister rseq for a clone with CLONE_VM set. + * + * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault + * on the COW page on exit to user space, when the child stays on the same + * CPU as the parent. That's obviously not guaranteed, but in overcommit + * scenarios it is more likely and optimizes for the fork/exec case without + * taking the fault. + */ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { -} -static inline void rseq_execve(struct task_struct *t) -{ + if (clone_flags & CLONE_VM) + rseq_reset(t); + else + t->rseq = current->rseq; } -#endif +#else /* CONFIG_RSEQ */ +static inline void rseq_handle_slowpath(struct pt_regs *regs) { } +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_sched_switch_event(struct task_struct *t) { } +static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } +static inline void rseq_force_update(void) { } +static inline void rseq_virt_userspace_exit(void) { } +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } +static inline void rseq_execve(struct task_struct *t) { } +#endif /* !CONFIG_RSEQ */ #ifdef CONFIG_DEBUG_RSEQ - void rseq_syscall(struct pt_regs *regs); - -#else - -static inline void rseq_syscall(struct pt_regs *regs) -{ -} - -#endif +#else /* CONFIG_DEBUG_RSEQ */ +static inline void rseq_syscall(struct pt_regs *regs) { } +#endif /* !CONFIG_DEBUG_RSEQ */ #endif /* _LINUX_RSEQ_H */ diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h new file mode 100644 index 000000000000..c92167ff8a7f --- /dev/null +++ b/include/linux/rseq_entry.h @@ -0,0 +1,616 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RSEQ_ENTRY_H +#define _LINUX_RSEQ_ENTRY_H + +/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */ +#ifdef CONFIG_RSEQ_STATS +#include <linux/percpu.h> + +struct rseq_stats { + unsigned long exit; + unsigned long signal; + unsigned long slowpath; + unsigned long fastpath; + unsigned long ids; + unsigned long cs; + unsigned long clear; + unsigned long fixup; +}; + +DECLARE_PER_CPU(struct rseq_stats, rseq_stats); + +/* + * Slow path has interrupts and preemption enabled, but the fast path + * runs with interrupts disabled so there is no point in having the + * preemption checks implied in __this_cpu_inc() for every operation. + */ +#ifdef RSEQ_BUILD_SLOW_PATH +#define rseq_stat_inc(which) this_cpu_inc((which)) +#else +#define rseq_stat_inc(which) raw_cpu_inc((which)) +#endif + +#else /* CONFIG_RSEQ_STATS */ +#define rseq_stat_inc(x) do { } while (0) +#endif /* !CONFIG_RSEQ_STATS */ + +#ifdef CONFIG_RSEQ +#include <linux/jump_label.h> +#include <linux/rseq.h> +#include <linux/uaccess.h> + +#include <linux/tracepoint-defs.h> + +#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(rseq_update); +DECLARE_TRACEPOINT(rseq_ip_fixup); +void __rseq_trace_update(struct task_struct *t); +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip); + +static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) +{ + if (tracepoint_enabled(rseq_update) && ids) + __rseq_trace_update(t); +} + +static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + if (tracepoint_enabled(rseq_ip_fixup)) + __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); +} + +#else /* CONFIG_TRACEPOINT */ +static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { } +static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) { } +#endif /* !CONFIG_TRACEPOINT */ + +DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); + +#ifdef RSEQ_BUILD_SLOW_PATH +#define rseq_inline +#else +#define rseq_inline __always_inline +#endif + +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); +bool rseq_debug_validate_ids(struct task_struct *t); + +static __always_inline void rseq_note_user_irq_entry(void) +{ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) + current->rseq.event.user_irq = true; +} + +/* + * Check whether there is a valid critical section and whether the + * instruction pointer in @regs is inside the critical section. + * + * - If the critical section is invalid, terminate the task. + * + * - If valid and the instruction pointer is inside, set it to the abort IP. + * + * - If valid and the instruction pointer is outside, clear the critical + * section address. + * + * Returns true, if the section was valid and either fixup or clear was + * done, false otherwise. + * + * In the failure case task::rseq_event::fatal is set when a invalid + * section was found. It's clear when the failure was an unresolved page + * fault. + * + * If inlined into the exit to user path with interrupts disabled, the + * caller has to protect against page faults with pagefault_disable(). + * + * In preemptible task context this would be counterproductive as the page + * faults could not be fully resolved. As a consequence unresolved page + * faults in task context are fatal too. + */ + +#ifdef RSEQ_BUILD_SLOW_PATH +/* + * The debug version is put out of line, but kept here so the code stays + * together. + * + * @csaddr has already been checked by the caller to be in user space + */ +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, + unsigned long csaddr) +{ + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; + u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE; + unsigned long ip = instruction_pointer(regs); + u64 __user *uc_head = (u64 __user *) ucs; + u32 usig, __user *uc_sig; + + scoped_user_rw_access(ucs, efault) { + /* + * Evaluate the user pile and exit if one of the conditions + * is not fulfilled. + */ + unsafe_get_user(start_ip, &ucs->start_ip, efault); + if (unlikely(start_ip >= tasksize)) + goto die; + /* If outside, just clear the critical section. */ + if (ip < start_ip) + goto clear; + + unsafe_get_user(offset, &ucs->post_commit_offset, efault); + cs_end = start_ip + offset; + /* Check for overflow and wraparound */ + if (unlikely(cs_end >= tasksize || cs_end < start_ip)) + goto die; + + /* If not inside, clear it. */ + if (ip >= cs_end) + goto clear; + + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); + /* Ensure it's "valid" */ + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) + goto die; + /* Validate that the abort IP is not in the critical section */ + if (unlikely(abort_ip - start_ip < offset)) + goto die; + + /* + * Check version and flags for 0. No point in emitting + * deprecated warnings before dying. That could be done in + * the slow path eventually, but *shrug*. + */ + unsafe_get_user(head, uc_head, efault); + if (unlikely(head)) + goto die; + + /* abort_ip - 4 is >= 0. See abort_ip check above */ + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); + unsafe_get_user(usig, uc_sig, efault); + if (unlikely(usig != t->rseq.sig)) + goto die; + + /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* If not in interrupt from user context, let it die */ + if (unlikely(!t->rseq.event.user_irq)) + goto die; + } + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + instruction_pointer_set(regs, (unsigned long)abort_ip); + rseq_stat_inc(rseq_stats.fixup); + break; + clear: + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + rseq_stat_inc(rseq_stats.clear); + abort_ip = 0ULL; + } + + if (unlikely(abort_ip)) + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + +/* + * On debug kernels validate that user space did not mess with it if the + * debug branch is enabled. + */ +bool rseq_debug_validate_ids(struct task_struct *t) +{ + struct rseq __user *rseq = t->rseq.usrptr; + u32 cpu_id, uval, node_id; + + /* + * On the first exit after registering the rseq region CPU ID is + * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! + */ + node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? + cpu_to_node(t->rseq.ids.cpu_id) : 0; + + scoped_user_read_access(rseq, efault) { + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); + if (cpu_id != t->rseq.ids.cpu_id) + goto die; + unsafe_get_user(uval, &rseq->cpu_id, efault); + if (uval != cpu_id) + goto die; + unsafe_get_user(uval, &rseq->node_id, efault); + if (uval != node_id) + goto die; + unsafe_get_user(uval, &rseq->mm_cid, efault); + if (uval != t->rseq.ids.mm_cid) + goto die; + } + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + +#endif /* RSEQ_BUILD_SLOW_PATH */ + +/* + * This only ensures that abort_ip is in the user address space and + * validates that it is preceded by the signature. + * + * No other sanity checks are done here, that's what the debug code is for. + */ +static rseq_inline bool +rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr) +{ + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; + unsigned long ip = instruction_pointer(regs); + unsigned long tasksize = TASK_SIZE; + u64 start_ip, abort_ip, offset; + u32 usig, __user *uc_sig; + + rseq_stat_inc(rseq_stats.cs); + + if (unlikely(csaddr >= tasksize)) { + t->rseq.event.fatal = true; + return false; + } + + if (static_branch_unlikely(&rseq_debug_enabled)) + return rseq_debug_update_user_cs(t, regs, csaddr); + + scoped_user_rw_access(ucs, efault) { + unsafe_get_user(start_ip, &ucs->start_ip, efault); + unsafe_get_user(offset, &ucs->post_commit_offset, efault); + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); + + /* + * No sanity checks. If user space screwed it up, it can + * keep the pieces. That's what debug code is for. + * + * If outside, just clear the critical section. + */ + if (ip - start_ip >= offset) + goto clear; + + /* + * Two requirements for @abort_ip: + * - Must be in user space as x86 IRET would happily return to + * the kernel. + * - The four bytes preceding the instruction at @abort_ip must + * contain the signature. + * + * The latter protects against the following attack vector: + * + * An attacker with limited abilities to write, creates a critical + * section descriptor, sets the abort IP to a library function or + * some other ROP gadget and stores the address of the descriptor + * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP + * protection. + */ + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) + goto die; + + /* The address is guaranteed to be >= 0 and < TASK_SIZE */ + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); + unsafe_get_user(usig, uc_sig, efault); + if (unlikely(usig != t->rseq.sig)) + goto die; + + /* Invalidate the critical section */ + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + /* Update the instruction pointer */ + instruction_pointer_set(regs, (unsigned long)abort_ip); + rseq_stat_inc(rseq_stats.fixup); + break; + clear: + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + rseq_stat_inc(rseq_stats.clear); + abort_ip = 0ULL; + } + + if (unlikely(abort_ip)) + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + +/* + * Updates CPU ID, Node ID and MM CID and reads the critical section + * address, when @csaddr != NULL. This allows to put the ID update and the + * read under the same uaccess region to spare a separate begin/end. + * + * As this is either invoked from a C wrapper with @csaddr = NULL or from + * the fast path code with a valid pointer, a clever compiler should be + * able to optimize the read out. Spares a duplicate implementation. + * + * Returns true, if the operation was successful, false otherwise. + * + * In the failure case task::rseq_event::fatal is set when invalid data + * was found on debug kernels. It's clear when the failure was an unresolved page + * fault. + * + * If inlined into the exit to user path with interrupts disabled, the + * caller has to protect against page faults with pagefault_disable(). + * + * In preemptible task context this would be counterproductive as the page + * faults could not be fully resolved. As a consequence unresolved page + * faults in task context are fatal too. + */ +static rseq_inline +bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, + u32 node_id, u64 *csaddr) +{ + struct rseq __user *rseq = t->rseq.usrptr; + + if (static_branch_unlikely(&rseq_debug_enabled)) { + if (!rseq_debug_validate_ids(t)) + return false; + } + + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); + unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); + unsafe_put_user(node_id, &rseq->node_id, efault); + unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); + if (csaddr) + unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); + } + + /* Cache the new values */ + t->rseq.ids.cpu_cid = ids->cpu_cid; + rseq_stat_inc(rseq_stats.ids); + rseq_trace_update(t, ids); + return true; +efault: + return false; +} + +/* + * Update user space with new IDs and conditionally check whether the task + * is in a critical section. + */ +static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, + struct rseq_ids *ids, u32 node_id) +{ + u64 csaddr; + + if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) + return false; + + /* + * On architectures which utilize the generic entry code this + * allows to skip the critical section when the entry was not from + * a user space interrupt, unless debug mode is enabled. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + if (!static_branch_unlikely(&rseq_debug_enabled)) { + if (likely(!t->rseq.event.user_irq)) + return true; + } + } + if (likely(!csaddr)) + return true; + /* Sigh, this really needs to do work */ + return rseq_update_user_cs(t, regs, csaddr); +} + +/* + * If you want to use this then convert your architecture to the generic + * entry code. I'm tired of building workarounds for people who can't be + * bothered to make the maintenance of generic infrastructure less + * burdensome. Just sucking everything into the architecture code and + * thereby making others chase the horrible hacks and keep them working is + * neither acceptable nor sustainable. + */ +#ifdef CONFIG_GENERIC_ENTRY + +/* + * This is inlined into the exit path because: + * + * 1) It's a one time comparison in the fast path when there is no event to + * handle + * + * 2) The access to the user space rseq memory (TLS) is unlikely to fault + * so the straight inline operation is: + * + * - Four 32-bit stores only if CPU ID/ MM CID need to be updated + * - One 64-bit load to retrieve the critical section address + * + * 3) In the unlikely case that the critical section address is != NULL: + * + * - One 64-bit load to retrieve the start IP + * - One 64-bit load to retrieve the offset for calculating the end + * - One 64-bit load to retrieve the abort IP + * - One 64-bit load to retrieve the signature + * - One store to clear the critical section address + * + * The non-debug case implements only the minimal required checking. It + * provides protection against a rogue abort IP in kernel space, which + * would be exploitable at least on x86, and also against a rogue CS + * descriptor by checking the signature at the abort IP. Any fallout from + * invalid critical section descriptors is a user space problem. The debug + * case provides the full set of checks and terminates the task if a + * condition is not met. + * + * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and + * tells the caller to loop back into exit_to_user_mode_loop(). The rseq + * slow path there will handle the failure. + */ +static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t) +{ + /* + * Page faults need to be disabled as this is called with + * interrupts disabled + */ + guard(pagefault)(); + if (likely(!t->rseq.event.ids_changed)) { + struct rseq __user *rseq = t->rseq.usrptr; + /* + * If IDs have not changed rseq_event::user_irq must be true + * See rseq_sched_switch_event(). + */ + u64 csaddr; + + if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs))) + return false; + + if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) { + if (unlikely(!rseq_update_user_cs(t, regs, csaddr))) + return false; + } + return true; + } + + struct rseq_ids ids = { + .cpu_id = task_cpu(t), + .mm_cid = task_mm_cid(t), + }; + u32 node_id = cpu_to_node(ids.cpu_id); + + return rseq_update_usr(t, regs, &ids, node_id); +} + +static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + struct task_struct *t = current; + + /* + * If the task did not go through schedule or got the flag enforced + * by the rseq syscall or execve, then nothing to do here. + * + * CPU ID and MM CID can only change when going through a context + * switch. + * + * rseq_sched_switch_event() sets the rseq_event::sched_switch bit + * only when rseq_event::has_rseq is true. That conditional is + * required to avoid setting the TIF bit if RSEQ is not registered + * for a task. rseq_event::sched_switch is cleared when RSEQ is + * unregistered by a task so it's sufficient to check for the + * sched_switch bit alone. + * + * A sane compiler requires three instructions for the nothing to do + * case including clearing the events, but your mileage might vary. + */ + if (unlikely((t->rseq.event.sched_switch))) { + rseq_stat_inc(rseq_stats.fastpath); + + if (unlikely(!rseq_exit_user_update(regs, t))) + return true; + } + /* Clear state so next entry starts from a clean slate */ + t->rseq.event.events = 0; + return false; +} + +/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */ +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +static __always_inline bool test_tif_rseq(unsigned long ti_work) +{ + return ti_work & _TIF_RSEQ; +} + +static __always_inline void clear_tif_rseq(void) +{ + static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME); + clear_thread_flag(TIF_RSEQ); +} +#else +static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; } +static __always_inline void clear_tif_rseq(void) { } +#endif + +static __always_inline bool +rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + if (likely(!test_tif_rseq(ti_work))) + return false; + + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { + current->rseq.event.slowpath = true; + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + return true; + } + + clear_tif_rseq(); + return false; +} + +#else /* CONFIG_GENERIC_ENTRY */ +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + return false; +} +#endif /* !CONFIG_GENERIC_ENTRY */ + +static __always_inline void rseq_syscall_exit_to_user_mode(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + /* Needed to remove the store for the !lockdep case */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + WARN_ON_ONCE(ev->sched_switch); + ev->events = 0; + } +} + +static __always_inline void rseq_irqentry_exit_to_user_mode(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + lockdep_assert_once(!ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing could not clear it. + */ + ev->events = 0; +} + +/* Required to keep ARM64 working */ +static __always_inline void rseq_exit_to_user_mode_legacy(void) +{ + struct rseq_event *ev = ¤t->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + if (static_branch_unlikely(&rseq_debug_enabled)) + WARN_ON_ONCE(ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing did not clear it. + */ + ev->events = 0; +} + +void __rseq_debug_syscall_return(struct pt_regs *regs); + +static inline void rseq_debug_syscall_return(struct pt_regs *regs) +{ + if (static_branch_unlikely(&rseq_debug_enabled)) + __rseq_debug_syscall_return(regs); +} +#else /* CONFIG_RSEQ */ +static inline void rseq_note_user_irq_entry(void) { } +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + return false; +} +static inline void rseq_syscall_exit_to_user_mode(void) { } +static inline void rseq_irqentry_exit_to_user_mode(void) { } +static inline void rseq_exit_to_user_mode_legacy(void) { } +static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } +#endif /* !CONFIG_RSEQ */ + +#endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h new file mode 100644 index 000000000000..332dc14b81c9 --- /dev/null +++ b/include/linux/rseq_types.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RSEQ_TYPES_H +#define _LINUX_RSEQ_TYPES_H + +#include <linux/irq_work_types.h> +#include <linux/types.h> +#include <linux/workqueue_types.h> + +#ifdef CONFIG_RSEQ +struct rseq; + +/** + * struct rseq_event - Storage for rseq related event management + * @all: Compound to initialize and clear the data efficiently + * @events: Compound to access events with a single load/store + * @sched_switch: True if the task was scheduled and needs update on + * exit to user + * @ids_changed: Indicator that IDs need to be updated + * @user_irq: True on interrupt entry from user mode + * @has_rseq: True if the task has a rseq pointer installed + * @error: Compound error code for the slow path to analyze + * @fatal: User space data corrupted or invalid + * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME + * is required + * + * @sched_switch and @ids_changed must be adjacent and the combo must be + * 16bit aligned to allow a single store, when both are set at the same + * time in the scheduler. + */ +struct rseq_event { + union { + u64 all; + struct { + union { + u32 events; + struct { + u8 sched_switch; + u8 ids_changed; + u8 user_irq; + }; + }; + + u8 has_rseq; + u8 __pad; + union { + u16 error; + struct { + u8 fatal; + u8 slowpath; + }; + }; + }; + }; +}; + +/** + * struct rseq_ids - Cache for ids, which need to be updated + * @cpu_cid: Compound of @cpu_id and @mm_cid to make the + * compiler emit a single compare on 64-bit + * @cpu_id: The CPU ID which was written last to user space + * @mm_cid: The MM CID which was written last to user space + * + * @cpu_id and @mm_cid are updated when the data is written to user space. + */ +struct rseq_ids { + union { + u64 cpu_cid; + struct { + u32 cpu_id; + u32 mm_cid; + }; + }; +}; + +/** + * struct rseq_data - Storage for all rseq related data + * @usrptr: Pointer to the registered user space RSEQ memory + * @len: Length of the RSEQ region + * @sig: Signature of critial section abort IPs + * @event: Storage for event management + * @ids: Storage for cached CPU ID and MM CID + */ +struct rseq_data { + struct rseq __user *usrptr; + u32 len; + u32 sig; + struct rseq_event event; + struct rseq_ids ids; +}; + +#else /* CONFIG_RSEQ */ +struct rseq_data { }; +#endif /* !CONFIG_RSEQ */ + +#ifdef CONFIG_SCHED_MM_CID + +#define MM_CID_UNSET BIT(31) +#define MM_CID_ONCPU BIT(30) +#define MM_CID_TRANSIT BIT(29) + +/** + * struct sched_mm_cid - Storage for per task MM CID data + * @active: MM CID is active for the task + * @cid: The CID associated to the task either permanently or + * borrowed from the CPU + */ +struct sched_mm_cid { + unsigned int active; + unsigned int cid; +}; + +/** + * struct mm_cid_pcpu - Storage for per CPU MM_CID data + * @cid: The CID associated to the CPU either permanently or + * while a task with a CID is running + */ +struct mm_cid_pcpu { + unsigned int cid; +}____cacheline_aligned_in_smp; + +/** + * struct mm_mm_cid - Storage for per MM CID data + * @pcpu: Per CPU storage for CIDs associated to a CPU + * @percpu: Set, when CIDs are in per CPU mode + * @transit: Set to MM_CID_TRANSIT during a mode change transition phase + * @max_cids: The exclusive maximum CID value for allocation and convergence + * @irq_work: irq_work to handle the affinity mode change case + * @work: Regular work to handle the affinity mode change case + * @lock: Spinlock to protect against affinity setting which can't take @mutex + * @mutex: Mutex to serialize forks and exits related to this mm + * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map + * is growth only. + * @users: The number of tasks sharing this MM. Separate from mm::mm_users + * as that is modified by mmget()/mm_put() by other entities which + * do not actually share the MM. + * @pcpu_thrs: Threshold for switching back from per CPU mode + * @update_deferred: A deferred switch back to per task mode is pending. + */ +struct mm_mm_cid { + /* Hotpath read mostly members */ + struct mm_cid_pcpu __percpu *pcpu; + unsigned int percpu; + unsigned int transit; + unsigned int max_cids; + + /* Rarely used. Moves @lock and @mutex into the second cacheline */ + struct irq_work irq_work; + struct work_struct work; + + raw_spinlock_t lock; + struct mutex mutex; + + /* Low frequency modified */ + unsigned int nr_cpus_allowed; + unsigned int users; + unsigned int pcpu_thrs; + unsigned int update_deferred; +}____cacheline_aligned_in_smp; +#else /* CONFIG_SCHED_MM_CID */ +struct mm_mm_cid { }; +struct sched_mm_cid { }; +#endif /* !CONFIG_SCHED_MM_CID */ + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index b469878de25c..fac12bb7dbe4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -41,7 +41,7 @@ #include <linux/task_io_accounting.h> #include <linux/posix-timers_types.h> #include <linux/restart_block.h> -#include <uapi/linux/rseq.h> +#include <linux/rseq_types.h> #include <linux/seqlock_types.h> #include <linux/kcsan.h> #include <linux/rv.h> @@ -637,8 +637,8 @@ struct sched_rt_entity { #endif } __randomize_layout; -typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); -typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); +struct rq_flags; +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); struct sched_dl_entity { struct rb_node rb_node; @@ -685,20 +685,22 @@ struct sched_dl_entity { * * @dl_server tells if this is a server entity. * - * @dl_defer tells if this is a deferred or regular server. For - * now only defer server exists. - * - * @dl_defer_armed tells if the deferrable server is waiting - * for the replenishment timer to activate it. - * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * + * @dl_defer tells if this is a deferred or regular server. For + * now only defer server exists. + * + * @dl_defer_armed tells if the deferrable server is waiting + * for the replenishment timer to activate it. + * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. + * + * @dl_defer_idle tracks idle state */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; @@ -709,6 +711,7 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; + unsigned int dl_defer_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -730,9 +733,6 @@ struct sched_dl_entity { * dl_server_update(). * * @rq the runqueue this server is for - * - * @server_has_tasks() returns true if @server_pick return a - * runnable task. */ struct rq *rq; dl_server_pick_f server_pick_task; @@ -1406,33 +1406,8 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_RSEQ - struct rseq __user *rseq; - u32 rseq_len; - u32 rseq_sig; - /* - * RmW on rseq_event_mask must be performed atomically - * with respect to preemption. - */ - unsigned long rseq_event_mask; -# ifdef CONFIG_DEBUG_RSEQ - /* - * This is a place holder to save a copy of the rseq fields for - * validation of read-only fields. The struct rseq has a - * variable-length array at the end, so it cannot be used - * directly. Reserve a size large enough for the known fields. - */ - char rseq_fields[sizeof(struct rseq)]; -# endif -#endif - -#ifdef CONFIG_SCHED_MM_CID - int mm_cid; /* Current cid in mm */ - int last_mm_cid; /* Most recent cid in mm */ - int migrate_from_cpu; - int mm_cid_active; /* Whether cid bitmap is active */ - struct callback_head cid_work; -#endif + struct rseq_data rseq; + struct sched_mm_cid mm_cid; struct tlbflush_unmap_batch tlb_ubc; @@ -1861,8 +1836,8 @@ extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); -/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ -extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); +/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ +extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task @@ -1901,6 +1876,7 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_para extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); +extern void sched_set_fifo_secondary(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); @@ -2058,6 +2034,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +static inline void set_need_resched_current(void) +{ + lockdep_assert_irqs_disabled(); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return @@ -2318,6 +2301,32 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +/* Avoids recursive inclusion hell */ +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit(struct task_struct *t); +static __always_inline int task_mm_cid(struct task_struct *t) +{ + return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); +} +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit(struct task_struct *t) { } +static __always_inline int task_mm_cid(struct task_struct *t) +{ + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return task_cpu(t); +} +#endif + #ifndef MODULE #ifndef COMPILE_OFFSETS diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index b7fafe999073..624fda17a785 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,7 +8,7 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ -static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm) +static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm) { /* * By convention, dumpable bits are contained in first 32 bits of the diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index bbcfdf12aa6e..45c0022b91ce 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -92,6 +92,9 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ /* idle_balance() stats */ + unsigned int newidle_call; + unsigned int newidle_success; + unsigned int newidle_ratio; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5ce48eab7a2a..a8a8661839b6 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -1209,4 +1209,118 @@ done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) if (seq & 1) read_sequnlock_excl_irqrestore(lock, flags); } + +enum ss_state { + ss_done = 0, + ss_lock, + ss_lock_irqsave, + ss_lockless, +}; + +struct ss_tmp { + enum ss_state state; + unsigned long data; + spinlock_t *lock; + spinlock_t *lock_irqsave; +}; + +static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) +{ + if (sst->lock) + spin_unlock(sst->lock); + if (sst->lock_irqsave) + spin_unlock_irqrestore(sst->lock_irqsave, sst->data); +} + +extern void __scoped_seqlock_invalid_target(void); + +#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || defined(CONFIG_KASAN) +/* + * For some reason some GCC-8 architectures (nios2, alpha) have trouble + * determining that the ss_done state is impossible in __scoped_seqlock_next() + * below. + * + * Similarly KASAN is known to confuse compilers enough to break this. But we + * don't care about code quality for KASAN builds anyway. + */ +static inline void __scoped_seqlock_bug(void) { } +#else +/* + * Canary for compiler optimization -- if the compiler doesn't realize this is + * an impossible state, it very likely generates sub-optimal code here. + */ +extern void __scoped_seqlock_bug(void); +#endif + +static inline void +__scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target) +{ + switch (sst->state) { + case ss_done: + __scoped_seqlock_bug(); + return; + + case ss_lock: + case ss_lock_irqsave: + sst->state = ss_done; + return; + + case ss_lockless: + if (!read_seqretry(lock, sst->data)) { + sst->state = ss_done; + return; + } + break; + } + + switch (target) { + case ss_done: + __scoped_seqlock_invalid_target(); + return; + + case ss_lock: + sst->lock = &lock->lock; + spin_lock(sst->lock); + sst->state = ss_lock; + return; + + case ss_lock_irqsave: + sst->lock_irqsave = &lock->lock; + spin_lock_irqsave(sst->lock_irqsave, sst->data); + sst->state = ss_lock_irqsave; + return; + + case ss_lockless: + sst->data = read_seqbegin(lock); + return; + } +} + +#define __scoped_seqlock_read(_seqlock, _target, _s) \ + for (struct ss_tmp _s __cleanup(__scoped_seqlock_cleanup) = \ + { .state = ss_lockless, .data = read_seqbegin(_seqlock) }; \ + _s.state != ss_done; \ + __scoped_seqlock_next(&_s, _seqlock, _target)) + +/** + * scoped_seqlock_read (lock, ss_state) - execute the read side critical + * section without manual sequence + * counter handling or calls to other + * helpers + * @lock: pointer to seqlock_t protecting the data + * @ss_state: one of {ss_lock, ss_lock_irqsave, ss_lockless} indicating + * the type of critical read section + * + * Example: + * + * scoped_seqlock_read (&lock, ss_lock) { + * // read-side critical section + * } + * + * Starts with a lockess pass first. If it fails, restarts the critical + * section with the lock held. + */ +#define scoped_seqlock_read(_seqlock, _target) \ + __scoped_seqlock_read(_seqlock, _target, __UNIQUE_ID(seqlock)) + #endif /* __LINUX_SEQLOCK_H */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0e47465ef0fd..774efe592a9a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -111,7 +111,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list); -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); +void shmem_truncate_range(struct inode *inode, loff_t start, uoff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 66c06fcdfe19..cf84d98964b2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -77,6 +77,7 @@ struct cachestat_range; struct cachestat; struct statmount; struct mnt_id_req; +struct ns_id_req; struct xattr_args; struct file_attr; @@ -437,6 +438,9 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req, asmlinkage long sys_listmount(const struct mnt_id_req __user *req, u64 __user *mnt_ids, size_t nr_mnt_ids, unsigned int flags); +asmlinkage long sys_listns(const struct ns_id_req __user *req, + u64 __user *ns_ids, size_t nr_ns_ids, + unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); asmlinkage long sys_ftruncate(unsigned int fd, off_t length); #if BITS_PER_LONG == 32 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index dd925d84fa46..b40de9bab4b7 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -67,6 +67,11 @@ enum syscall_work_bit { #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED #endif +#ifndef TIF_RSEQ +# define TIF_RSEQ TIF_NOTIFY_RESUME +# define _TIF_RSEQ _TIF_NOTIFY_RESUME +#endif + #ifdef __KERNEL__ #ifndef arch_set_restart_data diff --git a/include/linux/timer.h b/include/linux/timer.h index 0414d9e6b4fc..62e1cea71125 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -188,4 +188,13 @@ int timers_dead_cpu(unsigned int cpu); #define timers_dead_cpu NULL #endif +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask); +#else +static inline int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) +{ + return 0; +} +#endif + #endif diff --git a/include/linux/types.h b/include/linux/types.h index 6dfdb8e8e4c3..d4437e9c452c 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -50,6 +50,7 @@ typedef __kernel_old_gid_t old_gid_t; #if defined(__GNUC__) typedef __kernel_loff_t loff_t; +typedef __kernel_uoff_t uoff_t; #endif /* diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1beb5b395d81..be395f5f7ee3 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,6 +2,7 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ +#include <linux/cleanup.h> #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> @@ -35,9 +36,17 @@ #ifdef masked_user_access_begin #define can_do_masked_user_access() 1 +# ifndef masked_user_write_access_begin +# define masked_user_write_access_begin masked_user_access_begin +# endif +# ifndef masked_user_read_access_begin +# define masked_user_read_access_begin masked_user_access_begin +#endif #else #define can_do_masked_user_access() 0 #define masked_user_access_begin(src) NULL + #define masked_user_read_access_begin(src) NULL + #define masked_user_write_access_begin(src) NULL #define mask_user_address(src) (src) #endif @@ -518,7 +527,34 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); -#ifndef __get_kernel_nofault +#ifdef arch_get_kernel_nofault +/* + * Wrap the architecture implementation so that @label can be outside of a + * cleanup() scope. A regular C goto works correctly, but ASM goto does + * not. Clang rejects such an attempt, but GCC silently emits buggy code. + */ +#define __get_kernel_nofault(dst, src, type, label) \ +do { \ + __label__ local_label; \ + arch_get_kernel_nofault(dst, src, type, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, label) \ +do { \ + __label__ local_label; \ + arch_put_kernel_nofault(dst, src, type, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#elif !defined(__get_kernel_nofault) /* arch_get_kernel_nofault */ + #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ @@ -535,7 +571,8 @@ do { \ if (__put_user(data, p)) \ goto label; \ } while (0) -#endif + +#endif /* !__get_kernel_nofault */ /** * get_kernel_nofault(): safely attempt to read from a location @@ -549,7 +586,42 @@ do { \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) -#ifndef user_access_begin +#ifdef user_access_begin + +#ifdef arch_unsafe_get_user +/* + * Wrap the architecture implementation so that @label can be outside of a + * cleanup() scope. A regular C goto works correctly, but ASM goto does + * not. Clang rejects such an attempt, but GCC silently emits buggy code. + * + * Some architectures use internal local labels already, but this extra + * indirection here is harmless because the compiler optimizes it out + * completely in any case. This construct just ensures that the ASM GOTO + * target is always in the local scope. The C goto 'label' works correctly + * when leaving a cleanup() scope. + */ +#define unsafe_get_user(x, ptr, label) \ +do { \ + __label__ local_label; \ + arch_unsafe_get_user(x, ptr, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#define unsafe_put_user(x, ptr, label) \ +do { \ + __label__ local_label; \ + arch_unsafe_put_user(x, ptr, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) +#endif /* arch_unsafe_get_user */ + +#else /* user_access_begin */ #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) @@ -559,7 +631,8 @@ do { \ #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } -#endif +#endif /* !user_access_begin */ + #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end @@ -569,6 +642,239 @@ static inline void user_access_restore(unsigned long flags) { } #define user_read_access_end user_access_end #endif +/* Define RW variant so the below _mode macro expansion works */ +#define masked_user_rw_access_begin(u) masked_user_access_begin(u) +#define user_rw_access_begin(u, s) user_access_begin(u, s) +#define user_rw_access_end() user_access_end() + +/* Scoped user access */ +#define USER_ACCESS_GUARD(_mode) \ +static __always_inline void __user * \ +class_user_##_mode##_begin(void __user *ptr) \ +{ \ + return ptr; \ +} \ + \ +static __always_inline void \ +class_user_##_mode##_end(void __user *ptr) \ +{ \ + user_##_mode##_access_end(); \ +} \ + \ +DEFINE_CLASS(user_ ##_mode## _access, void __user *, \ + class_user_##_mode##_end(_T), \ + class_user_##_mode##_begin(ptr), void __user *ptr) \ + \ +static __always_inline class_user_##_mode##_access_t \ +class_user_##_mode##_access_ptr(void __user *scope) \ +{ \ + return scope; \ +} + +USER_ACCESS_GUARD(read) +USER_ACCESS_GUARD(write) +USER_ACCESS_GUARD(rw) +#undef USER_ACCESS_GUARD + +/** + * __scoped_user_access_begin - Start a scoped user access + * @mode: The mode of the access class (read, write, rw) + * @uptr: The pointer to access user space memory + * @size: Size of the access + * @elbl: Error label to goto when the access region is rejected + * + * Internal helper for __scoped_user_access(). Don't use directly. + */ +#define __scoped_user_access_begin(mode, uptr, size, elbl) \ +({ \ + typeof(uptr) __retptr; \ + \ + if (can_do_masked_user_access()) { \ + __retptr = masked_user_##mode##_access_begin(uptr); \ + } else { \ + __retptr = uptr; \ + if (!user_##mode##_access_begin(uptr, size)) \ + goto elbl; \ + } \ + __retptr; \ +}) + +/** + * __scoped_user_access - Open a scope for user access + * @mode: The mode of the access class (read, write, rw) + * @uptr: The pointer to access user space memory + * @size: Size of the access + * @elbl: Error label to goto when the access region is rejected. It + * must be placed outside the scope + * + * If the user access function inside the scope requires a fault label, it + * can use @elbl or a different label outside the scope, which requires + * that user access which is implemented with ASM GOTO has been properly + * wrapped. See unsafe_get_user() for reference. + * + * scoped_user_rw_access(ptr, efault) { + * unsafe_get_user(rval, &ptr->rval, efault); + * unsafe_put_user(wval, &ptr->wval, efault); + * } + * return 0; + * efault: + * return -EFAULT; + * + * The scope is internally implemented as a autoterminating nested for() + * loop, which can be left with 'return', 'break' and 'goto' at any + * point. + * + * When the scope is left user_##@_mode##_access_end() is automatically + * invoked. + * + * When the architecture supports masked user access and the access region + * which is determined by @uptr and @size is not a valid user space + * address, i.e. < TASK_SIZE, the scope sets the pointer to a faulting user + * space address and does not terminate early. This optimizes for the good + * case and lets the performance uncritical bad case go through the fault. + * + * The eventual modification of the pointer is limited to the scope. + * Outside of the scope the original pointer value is unmodified, so that + * the original pointer value is available for diagnostic purposes in an + * out of scope fault path. + * + * Nesting scoped user access into a user access scope is invalid and fails + * the build. Nesting into other guards, e.g. pagefault is safe. + * + * The masked variant does not check the size of the access and relies on a + * mapping hole (e.g. guard page) to catch an out of range pointer, the + * first access to user memory inside the scope has to be within + * @uptr ... @uptr + PAGE_SIZE - 1 + * + * Don't use directly. Use scoped_masked_user_$MODE_access() instead. + */ +#define __scoped_user_access(mode, uptr, size, elbl) \ +for (bool done = false; !done; done = true) \ + for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ + !done; done = true) \ + for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true) \ + /* Force modified pointer usage within the scope */ \ + for (const typeof(uptr) uptr = _tmpptr; !done; done = true) + +/** + * scoped_user_read_access_size - Start a scoped user read access with given size + * @usrc: Pointer to the user space address to read from + * @size: Size of the access starting from @usrc + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_read_access_size(usrc, size, elbl) \ + __scoped_user_access(read, usrc, size, elbl) + +/** + * scoped_user_read_access - Start a scoped user read access + * @usrc: Pointer to the user space address to read from + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @usrc is determined via sizeof(*@usrc)). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_read_access(usrc, elbl) \ + scoped_user_read_access_size(usrc, sizeof(*(usrc)), elbl) + +/** + * scoped_user_write_access_size - Start a scoped user write access with given size + * @udst: Pointer to the user space address to write to + * @size: Size of the access starting from @udst + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_write_access_size(udst, size, elbl) \ + __scoped_user_access(write, udst, size, elbl) + +/** + * scoped_user_write_access - Start a scoped user write access + * @udst: Pointer to the user space address to write to + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @udst is determined via sizeof(*@udst)). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_write_access(udst, elbl) \ + scoped_user_write_access_size(udst, sizeof(*(udst)), elbl) + +/** + * scoped_user_rw_access_size - Start a scoped user read/write access with given size + * @uptr Pointer to the user space address to read from and write to + * @size: Size of the access starting from @uptr + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_rw_access_size(uptr, size, elbl) \ + __scoped_user_access(rw, uptr, size, elbl) + +/** + * scoped_user_rw_access - Start a scoped user read/write access + * @uptr Pointer to the user space address to read from and write to + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @uptr is determined via sizeof(*@uptr)). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_rw_access(uptr, elbl) \ + scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl) + +/** + * get_user_inline - Read user data inlined + * @val: The variable to store the value read from user memory + * @usrc: Pointer to the user space memory to read from + * + * Return: 0 if successful, -EFAULT when faulted + * + * Inlined variant of get_user(). Only use when there is a demonstrable + * performance reason. + */ +#define get_user_inline(val, usrc) \ +({ \ + __label__ efault; \ + typeof(usrc) _tmpsrc = usrc; \ + int _ret = 0; \ + \ + scoped_user_read_access(_tmpsrc, efault) \ + unsafe_get_user(val, _tmpsrc, efault); \ + if (0) { \ + efault: \ + _ret = -EFAULT; \ + } \ + _ret; \ +}) + +/** + * put_user_inline - Write to user memory inlined + * @val: The value to write + * @udst: Pointer to the user space memory to write to + * + * Return: 0 if successful, -EFAULT when faulted + * + * Inlined variant of put_user(). Only use when there is a demonstrable + * performance reason. + */ +#define put_user_inline(val, udst) \ +({ \ + __label__ efault; \ + typeof(udst) _tmpdst = udst; \ + int _ret = 0; \ + \ + scoped_user_write_access(_tmpdst, efault) \ + unsafe_put_user(val, _tmpdst, efault); \ + if (0) { \ + efault: \ + _ret = -EFAULT; \ + } \ + _ret; \ +}) + #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index 26122d00708a..bc7ae7d21900 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -6,16 +6,6 @@ #include <linux/unwind_user.h> #include <linux/unwind_deferred_types.h> -struct unwind_work; - -typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie); - -struct unwind_work { - struct list_head list; - unwind_callback_t func; - int bit; -}; - #ifdef CONFIG_UNWIND_USER enum { @@ -44,22 +34,22 @@ void unwind_deferred_task_exit(struct task_struct *task); static __always_inline void unwind_reset_info(void) { struct unwind_task_info *info = ¤t->unwind_info; - unsigned long bits; + unsigned long bits = atomic_long_read(&info->unwind_mask); /* Was there any unwinding? */ - if (unlikely(info->unwind_mask)) { - bits = info->unwind_mask; - do { - /* Is a task_work going to run again before going back */ - if (bits & UNWIND_PENDING) - return; - } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL)); - current->unwind_info.id.id = 0; - - if (unlikely(info->cache)) { - info->cache->nr_entries = 0; - info->cache->unwind_completed = 0; - } + if (likely(!bits)) + return; + + do { + /* Is a task_work going to run again before going back */ + if (bits & UNWIND_PENDING) + return; + } while (!atomic_long_try_cmpxchg(&info->unwind_mask, &bits, 0UL)); + current->unwind_info.id.id = 0; + + if (unlikely(info->cache)) { + info->cache->nr_entries = 0; + info->cache->unwind_completed = 0; } } @@ -68,9 +58,17 @@ static __always_inline void unwind_reset_info(void) static inline void unwind_task_init(struct task_struct *task) {} static inline void unwind_task_free(struct task_struct *task) {} -static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; } -static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; } -static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; } +static inline int unwind_user_faultable(struct unwind_stacktrace *trace) +{ return -ENOSYS; } + +static inline int +unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) +{ return -ENOSYS; } + +static inline int +unwind_deferred_request(struct unwind_work *work, u64 *timestamp) +{ return -ENOSYS; } + static inline void unwind_deferred_cancel(struct unwind_work *work) {} static inline void unwind_deferred_task_exit(struct task_struct *task) {} diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h index 33b62ac25c86..18fa3932f61c 100644 --- a/include/linux/unwind_deferred_types.h +++ b/include/linux/unwind_deferred_types.h @@ -2,6 +2,9 @@ #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H +#include <linux/types.h> +#include <linux/atomic.h> + struct unwind_cache { unsigned long unwind_completed; unsigned int nr_entries; @@ -30,10 +33,23 @@ union unwind_task_id { }; struct unwind_task_info { - unsigned long unwind_mask; + atomic_long_t unwind_mask; struct unwind_cache *cache; struct callback_head work; union unwind_task_id id; }; +struct unwind_work; +struct unwind_stacktrace; + +typedef void (*unwind_callback_t)(struct unwind_work *work, + struct unwind_stacktrace *trace, + u64 cookie); + +struct unwind_work { + struct list_head list; + unwind_callback_t func; + int bit; +}; + #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */ diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h index a449f15be890..412729a269bc 100644 --- a/include/linux/unwind_user_types.h +++ b/include/linux/unwind_user_types.h @@ -36,8 +36,10 @@ struct unwind_user_state { unsigned long ip; unsigned long sp; unsigned long fp; + unsigned int ws; enum unwind_user_type current_type; unsigned int available_types; + bool topmost; bool done; }; diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 3aaf19e77558..8285b19a25e0 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -376,6 +376,9 @@ struct usb_gadget_ops { * can handle. The UDC must support this and all slower speeds and lower * number of lanes. * @state: the state we are now (attached, suspended, configured, etc) + * @state_lock: Spinlock protecting the `state` and `teardown` members. + * @teardown: True if the device is undergoing teardown, used to prevent + * new work from being scheduled during cleanup. * @name: Identifies the controller hardware type. Used in diagnostics * and sometimes configuration. * @dev: Driver model state for this abstract device. @@ -451,6 +454,8 @@ struct usb_gadget { enum usb_ssp_rate max_ssp_rate; enum usb_device_state state; + spinlock_t state_lock; + bool teardown; const char *name; struct device dev; unsigned isoch_delay; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 9a9aebbf96b9..9c3be157397e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -166,13 +166,13 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } -#ifdef CONFIG_USER_NS - static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } +#ifdef CONFIG_USER_NS + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 4d1780848d0e..75dabb763c65 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -384,7 +384,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, struct virtio_net_hdr_v1_hash_tunnel *vhdr, bool tnl_hdr_negotiated, bool little_endian, - int vlan_hlen) + int vlan_hlen, + bool has_data_valid) { struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr; unsigned int inner_nh, outer_th; @@ -394,14 +395,15 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM); if (!tnl_gso_type) - return virtio_net_hdr_from_skb(skb, hdr, little_endian, false, - vlan_hlen); + return virtio_net_hdr_from_skb(skb, hdr, little_endian, + has_data_valid, vlan_hlen); /* Tunnel support not negotiated but skb ask for it. */ if (!tnl_hdr_negotiated) return -EINVAL; - vhdr->hash_hdr.hash_value = 0; + vhdr->hash_hdr.hash_value_lo = 0; + vhdr->hash_hdr.hash_value_hi = 0; vhdr->hash_hdr.hash_report = 0; vhdr->hash_hdr.padding = 0; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 22dd4adc5667..f48e8ccffe81 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,11 +189,11 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, void inode_wait_for_writeback(struct inode *inode); void inode_io_list_del(struct inode *inode); -/* writeback.h requires fs.h; it, too, is not included from here. */ -static inline void wait_on_inode(struct inode *inode) +static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc) { - wait_var_event(inode_state_wait_address(inode, __I_NEW), - !(READ_ONCE(inode->i_state) & I_NEW)); + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + return PAGECACHE_TAG_TOWRITE; + return PAGECACHE_TAG_DIRTY; } #ifdef CONFIG_CGROUP_WRITEBACK @@ -234,7 +234,7 @@ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { - WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); + WARN_ON_ONCE(!(inode_state_read_once(inode) & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } @@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) + #endif /* WRITEBACK_H */ diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 86b0d47984a1..64e9afe7d647 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -85,12 +85,12 @@ int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, - struct inode **); + struct delegated_inode *); int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, - const char *, struct inode **); + const char *, struct delegated_inode *); int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8d0e703bc929..cb4c02d00759 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2783,6 +2783,11 @@ struct hci_ev_le_per_adv_report { __u8 data[]; } __packed; +#define HCI_EV_LE_PA_SYNC_LOST 0x10 +struct hci_ev_le_pa_sync_lost { + __le16 handle; +} __packed; + #define LE_PA_DATA_COMPLETE 0x00 #define LE_PA_DATA_MORE_TO_COME 0x01 #define LE_PA_DATA_TRUNCATED 0x02 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b8100dbfe5d7..0cb87687837f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -749,7 +749,6 @@ struct hci_conn { __u8 remote_cap; __u8 remote_auth; - __u8 remote_id; unsigned int sent; @@ -857,11 +856,12 @@ extern struct mutex hci_cb_list_lock; /* ----- HCI interface to upper protocols ----- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr); int l2cap_disconn_ind(struct hci_conn *hcon); -void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, + u16 flags); #if IS_ENABLED(CONFIG_BT_BREDR) int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); -void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb); +int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb); #else static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) @@ -869,23 +869,30 @@ static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, return 0; } -static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +static inline int sco_recv_scodata(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb) { + kfree_skb(skb); + return -ENOENT; } #endif #if IS_ENABLED(CONFIG_BT_LE) int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); -void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, + u16 flags); #else static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { return 0; } -static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, - u16 flags) + +static inline int iso_recv(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb, u16 flags) { + kfree_skb(skb); + return -ENOENT; } #endif diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index bca0333f1e99..f5be96f08b9d 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -780,7 +780,7 @@ struct mgmt_adv_pattern { __u8 ad_type; __u8 offset; __u8 length; - __u8 value[31]; + __u8 value[HCI_MAX_AD_LENGTH]; } __packed; #define MGMT_OP_ADD_ADV_PATTERNS_MONITOR 0x0052 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 781624f5913a..820e299f06b5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6435,6 +6435,11 @@ static inline void wiphy_delayed_work_init(struct wiphy_delayed_work *dwork, * after wiphy_lock() was called. Therefore, wiphy_cancel_work() can * use just cancel_work() instead of cancel_work_sync(), it requires * being in a section protected by wiphy_lock(). + * + * Note that these are scheduled with a timer where the accuracy + * becomes less the longer in the future the scheduled timer is. Use + * wiphy_hrtimer_work_queue() if the timer must be not be late by more + * than approximately 10 percent. */ void wiphy_delayed_work_queue(struct wiphy *wiphy, struct wiphy_delayed_work *dwork, @@ -6506,6 +6511,79 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy, bool wiphy_delayed_work_pending(struct wiphy *wiphy, struct wiphy_delayed_work *dwork); +struct wiphy_hrtimer_work { + struct wiphy_work work; + struct wiphy *wiphy; + struct hrtimer timer; +}; + +enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t); + +static inline void wiphy_hrtimer_work_init(struct wiphy_hrtimer_work *hrwork, + wiphy_work_func_t func) +{ + hrtimer_setup(&hrwork->timer, wiphy_hrtimer_work_timer, + CLOCK_BOOTTIME, HRTIMER_MODE_REL); + wiphy_work_init(&hrwork->work, func); +} + +/** + * wiphy_hrtimer_work_queue - queue hrtimer work for the wiphy + * @wiphy: the wiphy to queue for + * @hrwork: the high resolution timer worker + * @delay: the delay given as a ktime_t + * + * Please refer to wiphy_delayed_work_queue(). The difference is that + * the hrtimer work uses a high resolution timer for scheduling. This + * may be needed if timeouts might be scheduled further in the future + * and the accuracy of the normal timer is not sufficient. + * + * Expect a delay of a few milliseconds as the timer is scheduled + * with some slack and some more time may pass between queueing the + * work and its start. + */ +void wiphy_hrtimer_work_queue(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork, + ktime_t delay); + +/** + * wiphy_hrtimer_work_cancel - cancel previously queued hrtimer work + * @wiphy: the wiphy, for debug purposes + * @hrtimer: the hrtimer work to cancel + * + * Cancel the work *without* waiting for it, this assumes being + * called under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_hrtimer_work_cancel(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrtimer); + +/** + * wiphy_hrtimer_work_flush - flush previously queued hrtimer work + * @wiphy: the wiphy, for debug purposes + * @hrwork: the hrtimer work to flush + * + * Flush the work (i.e. run it if pending). This must be called + * under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_hrtimer_work_flush(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork); + +/** + * wiphy_hrtimer_work_pending - Find out whether a wiphy hrtimer + * work item is currently pending. + * + * @wiphy: the wiphy, for debug purposes + * @hrwork: the hrtimer work in question + * + * Return: true if timer is pending, false otherwise + * + * Please refer to the wiphy_delayed_work_pending() documentation as + * this is the equivalent function for hrtimer based delayed work + * items. + */ +bool wiphy_hrtimer_work_pending(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork); + /** * enum ieee80211_ap_reg_power - regulatory power for an Access Point * diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index bc3507edd589..898723ab62e8 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -513,7 +513,7 @@ struct libeth_xdp_tx_desc { * can't fail, but can send less frames if there's no enough free descriptors * available. The actual free space is returned by @prep from the driver. */ -static __always_inline u32 +static __always_inline __nocfi_generic u32 libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq, u32 n, bool unroll, u64 priv, u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c64fd896b1f9..99ac747b7906 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -536,6 +536,8 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: + if (!skb_transport_header_was_set(skb)) + break; return skb_transport_header(skb); } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f3014e4f54fc..0a14daaa5dd4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -536,7 +536,8 @@ static inline int xfrm_af2proto(unsigned int family) static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { - if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || + if ((x->sel.family != AF_UNSPEC) || + (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) return &x->inner_mode; else diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h index 823b47d1ba1e..ce85d650bf4b 100644 --- a/include/trace/events/rseq.h +++ b/include/trace/events/rseq.h @@ -21,9 +21,9 @@ TRACE_EVENT(rseq_update, ), TP_fast_assign( - __entry->cpu_id = raw_smp_processor_id(); + __entry->cpu_id = t->rseq.ids.cpu_id; __entry->node_id = cpu_to_node(__entry->cpu_id); - __entry->mm_cid = task_mm_cid(t); + __entry->mm_cid = t->rseq.ids.mm_cid; ), TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id, diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h index 47db5eaf2f9a..61171b13c687 100644 --- a/include/trace/events/timer_migration.h +++ b/include/trace/events/timer_migration.h @@ -173,14 +173,14 @@ DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_active, TP_ARGS(tmc) ); -DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_online, +DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_available, TP_PROTO(struct tmigr_cpu *tmc), TP_ARGS(tmc) ); -DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_offline, +DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_unavailable, TP_PROTO(struct tmigr_cpu *tmc), diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index c08aff044e80..311a341e6fe4 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->flags = flags; ), @@ -748,7 +748,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue, strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), @@ -787,7 +787,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; @@ -839,7 +839,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h index b5f7594eee7a..0a90ad92dbf3 100644 --- a/include/uapi/asm-generic/posix_types.h +++ b/include/uapi/asm-generic/posix_types.h @@ -86,6 +86,7 @@ typedef struct { */ typedef __kernel_long_t __kernel_off_t; typedef long long __kernel_loff_t; +typedef unsigned long long __kernel_uoff_t; typedef __kernel_long_t __kernel_old_time_t; #ifndef __KERNEL__ typedef __kernel_long_t __kernel_time_t; diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 04e0077fb4c9..942370b3f5d2 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -857,9 +857,11 @@ __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr) __SYSCALL(__NR_file_getattr, sys_file_getattr) #define __NR_file_setattr 469 __SYSCALL(__NR_file_setattr, sys_file_setattr) +#define __NR_listns 470 +__SYSCALL(__NR_listns, sys_listns) #undef __NR_syscalls -#define __NR_syscalls 470 +#define __NR_syscalls 471 /* * 32 bit systems traditionally used different diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index ea91aa8afde9..e527b24bd824 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -979,14 +979,20 @@ extern "C" { * 2 = Gob Height 8, Turing+ Page Kind mapping * 3 = Reserved for future use. * - * 22:22 s Sector layout. On Tegra GPUs prior to Xavier, there is a further - * bit remapping step that occurs at an even lower level than the - * page kind and block linear swizzles. This causes the layout of - * surfaces mapped in those SOC's GPUs to be incompatible with the - * equivalent mapping on other GPUs in the same system. - * - * 0 = Tegra K1 - Tegra Parker/TX2 Layout. - * 1 = Desktop GPU and Tegra Xavier+ Layout + * 22:22 s Sector layout. There is a further bit remapping step that occurs + * 26:27 at an even lower level than the page kind and block linear + * swizzles. This causes the bit arrangement of surfaces in memory + * to differ subtly, and prevents direct sharing of surfaces between + * GPUs with different layouts. + * + * 0 = Tegra K1 - Tegra Parker/TX2 Layout + * 1 = Pre-GB20x, GB20x 32+ bpp, GB10, Tegra Xavier-Orin Layout + * 2 = GB20x(Blackwell 2)+ 8 bpp surface layout + * 3 = GB20x(Blackwell 2)+ 16 bpp surface layout + * 4 = Reserved for future use. + * 5 = Reserved for future use. + * 6 = Reserved for future use. + * 7 = Reserved for future use. * * 25:23 c Lossless Framebuffer Compression type. * @@ -1001,7 +1007,7 @@ extern "C" { * 6 = Reserved for future use * 7 = Reserved for future use * - * 55:25 - Reserved for future use. Must be zero. + * 55:28 - Reserved for future use. Must be zero. */ #define DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(c, s, g, k, h) \ fourcc_mod_code(NVIDIA, (0x10 | \ @@ -1009,6 +1015,7 @@ extern "C" { (((k) & 0xff) << 12) | \ (((g) & 0x3) << 20) | \ (((s) & 0x1) << 22) | \ + (((s) & 0x6) << 25) | \ (((c) & 0x7) << 23))) /* To grandfather in prior block linear format modifiers to the above layout, diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 3741ea1b73d8..5e277fd955aa 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -4,6 +4,11 @@ #include <asm/fcntl.h> #include <linux/openat2.h> +#ifdef __KERNEL__ +#include <linux/types.h> +#else +#include <stdint.h> +#endif #define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) #define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) @@ -79,6 +84,17 @@ */ #define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET +/* Set/Get delegations */ +#define F_GETDELEG (F_LINUX_SPECIFIC_BASE + 15) +#define F_SETDELEG (F_LINUX_SPECIFIC_BASE + 16) + +/* Argument structure for F_GETDELEG and F_SETDELEG */ +struct delegation { + uint32_t d_flags; /* Must be 0 */ + uint16_t d_type; /* F_RDLCK, F_WRLCK, F_UNLCK */ + uint16_t __pad; /* Must be 0 */ +}; + /* * Types of directory notifications that may be requested. */ diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 4a9fbf42aa9f..30f3c9eaafaa 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -27,7 +27,7 @@ #define INPUT_PROP_TOPBUTTONPAD 0x04 /* softbuttons at top of pad */ #define INPUT_PROP_POINTING_STICK 0x05 /* is a pointing stick */ #define INPUT_PROP_ACCELEROMETER 0x06 /* has accelerometer */ -#define INPUT_PROP_HAPTIC_TOUCHPAD 0x07 /* is a haptic touchpad */ +#define INPUT_PROP_PRESSUREPAD 0x07 /* pressure triggers clicks */ #define INPUT_PROP_MAX 0x1f #define INPUT_PROP_CNT (INPUT_PROP_MAX + 1) @@ -631,6 +631,18 @@ #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ +/* + * Keycodes for hotkeys toggling the electronic privacy screen found on some + * laptops on/off. Note when the embedded-controller turns on/off the eprivacy + * screen itself then the state should be reported through drm connecter props: + * https://www.kernel.org/doc/html/latest/gpu/drm-kms.html#standard-connector-properties + * Except when implementing the drm connecter properties API is not possible + * because e.g. the firmware does not allow querying the presence and/or status + * of the eprivacy screen at boot. + */ +#define KEY_EPRIVACY_SCREEN_ON 0x252 +#define KEY_EPRIVACY_SCREEN_OFF 0x253 + #define KEY_KBDINPUTASSIST_PREV 0x260 #define KEY_KBDINPUTASSIST_NEXT 0x261 #define KEY_KBDINPUTASSIST_PREVGROUP 0x262 diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 263bed13473e..b7c8dad26690 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -689,9 +689,6 @@ enum io_uring_register_op { /* query various aspects of io_uring, see linux/io_uring/query.h */ IORING_REGISTER_QUERY = 35, - /* return zcrx buffers back into circulation */ - IORING_REGISTER_ZCRX_REFILL = 36, - /* this goes last */ IORING_REGISTER_LAST, @@ -1073,15 +1070,6 @@ struct io_uring_zcrx_ifq_reg { __u64 __resv[3]; }; -struct io_uring_zcrx_sync_refill { - __u32 zcrx_id; - /* the number of entries to return */ - __u32 nr_entries; - /* pointer to an array of struct io_uring_zcrx_rqe */ - __u64 rqes; - __u64 __resv[2]; -}; - #ifdef __cplusplus } #endif diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h index 5d754322a27c..3539ccbfd064 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -36,6 +36,9 @@ struct io_uring_query_opcode { __u64 enter_flags; /* Bitmask of all supported IOSQE_* flags */ __u64 sqe_flags; + /* The number of available query opcodes */ + __u32 nr_query_opcodes; + __u32 __pad; }; #endif diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index 8197a4800604..40aa545101a3 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -52,7 +52,7 @@ struct isst_if_cpu_map { /** * struct isst_if_cpu_maps - structure for CPU map IOCTL * @cmd_count: Number of CPU mapping command in cpu_map[] - * @cpu_map[]: Holds one or more CPU map data structure + * @cpu_map: Holds one or more CPU map data structure * * This structure used with ioctl ISST_IF_GET_PHY_ID to send * one or more CPU mapping commands. Here IOCTL return value indicates @@ -82,8 +82,8 @@ struct isst_if_io_reg { /** * struct isst_if_io_regs - structure for IO register commands - * @cmd_count: Number of io reg commands in io_reg[] - * @io_reg[]: Holds one or more io_reg command structure + * @req_count: Number of io reg commands in io_reg[] + * @io_reg: Holds one or more io_reg command structure * * This structure used with ioctl ISST_IF_IO_CMD to send * one or more read/write commands to PUNIT. Here IOCTL return value @@ -120,7 +120,7 @@ struct isst_if_mbox_cmd { /** * struct isst_if_mbox_cmds - structure for mailbox commands * @cmd_count: Number of mailbox commands in mbox_cmd[] - * @mbox_cmd[]: Holds one or more mbox commands + * @mbox_cmd: Holds one or more mbox commands * * This structure used with ioctl ISST_IF_MBOX_COMMAND to send * one or more mailbox commands to PUNIT. Here IOCTL return value @@ -152,7 +152,7 @@ struct isst_if_msr_cmd { /** * struct isst_if_msr_cmds - structure for msr commands * @cmd_count: Number of mailbox commands in msr_cmd[] - * @msr_cmd[]: Holds one or more msr commands + * @msr_cmd: Holds one or more msr commands * * This structure used with ioctl ISST_IF_MSR_COMMAND to send * one or more MSR commands. IOCTL return value indicates number of @@ -167,8 +167,9 @@ struct isst_if_msr_cmds { * struct isst_core_power - Structure to get/set core_power feature * @get_set: 0: Get, 1: Set * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @enable: Feature enable status + * @supported: Power domain supports SST_CP interface * @priority_type: Priority type for the feature (ordered/proportional) * * Structure to get/set core_power feature state using IOCTL @@ -187,11 +188,11 @@ struct isst_core_power { * struct isst_clos_param - Structure to get/set clos praram * @get_set: 0: Get, 1: Set * @socket_id: Socket/package id - * @power_domain: Power Domain id - * clos: Clos ID for the parameters - * min_freq_mhz: Minimum frequency in MHz - * max_freq_mhz: Maximum frequency in MHz - * prop_prio: Proportional priority from 0-15 + * @power_domain_id: Power Domain id + * @clos: Clos ID for the parameters + * @min_freq_mhz: Minimum frequency in MHz + * @max_freq_mhz: Maximum frequency in MHz + * @prop_prio: Proportional priority from 0-15 * * Structure to get/set per clos property using IOCTL * ISST_IF_CLOS_PARAM. @@ -209,7 +210,7 @@ struct isst_clos_param { /** * struct isst_if_clos_assoc - Structure to assign clos to a CPU * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @logical_cpu: CPU number * @clos: Clos ID to assign to the logical CPU * @@ -228,6 +229,7 @@ struct isst_if_clos_assoc { * @get_set: Request is for get or set * @punit_cpu_map: Set to 1 if the CPU number is punit numbering not * Linux CPU number + * @assoc_info: CLOS data for this CPU * * Structure used to get/set associate CPUs to clos using IOCTL * ISST_IF_CLOS_ASSOC. @@ -257,7 +259,7 @@ struct isst_tpmi_instance_count { /** * struct isst_perf_level_info - Structure to get information on SST-PP levels * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @logical_cpu: CPU number * @clos: Clos ID to assign to the logical CPU * @max_level: Maximum performance level supported by the platform @@ -267,8 +269,8 @@ struct isst_tpmi_instance_count { * @feature_state: SST-BF and SST-TF (enabled/disabled) status at current level * @locked: SST-PP performance level change is locked/unlocked * @enabled: SST-PP feature is enabled or not - * @sst-tf_support: SST-TF support status at this level - * @sst-bf_support: SST-BF support status at this level + * @sst_tf_support: SST-TF support status at this level + * @sst_bf_support: SST-BF support status at this level * * Structure to get SST-PP details using IOCTL ISST_IF_PERF_LEVELS. */ @@ -289,7 +291,7 @@ struct isst_perf_level_info { /** * struct isst_perf_level_control - Structure to set SST-PP level * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: level to set * * Structure used change SST-PP level using IOCTL ISST_IF_PERF_SET_LEVEL. @@ -303,7 +305,7 @@ struct isst_perf_level_control { /** * struct isst_perf_feature_control - Structure to activate SST-BF/SST-TF * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @feature: bit 0 = SST-BF state, bit 1 = SST-TF state * * Structure used to enable SST-BF/SST-TF using IOCTL ISST_IF_PERF_SET_FEATURE. @@ -320,7 +322,7 @@ struct isst_perf_feature_control { /** * struct isst_perf_level_data_info - Structure to get SST-PP level details * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @tdp_ratio: TDP Ratio * @base_freq_mhz: Base frequency in MHz @@ -341,8 +343,8 @@ struct isst_perf_feature_control { * @pm_fabric_freq_mhz: Fabric (Uncore) minimum frequency * @max_buckets: Maximum trl buckets * @max_trl_levels: Maximum trl levels - * @bucket_core_counts[TRL_MAX_BUCKETS]: Number of cores per bucket - * @trl_freq_mhz[TRL_MAX_LEVELS][TRL_MAX_BUCKETS]: maximum frequency + * @bucket_core_counts: Number of cores per bucket + * @trl_freq_mhz: maximum frequency * for a bucket and trl level * * Structure used to get information on frequencies and TDP for a SST-PP @@ -402,7 +404,7 @@ struct isst_perf_level_fabric_info { /** * struct isst_perf_level_cpu_mask - Structure to get SST-PP level CPU mask * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @punit_cpu_map: Set to 1 if the CPU number is punit numbering not * Linux CPU number. If 0 CPU buffer is copied to user space @@ -430,7 +432,7 @@ struct isst_perf_level_cpu_mask { /** * struct isst_base_freq_info - Structure to get SST-BF frequencies * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @high_base_freq_mhz: High priority CPU base frequency * @low_base_freq_mhz: Low priority CPU base frequency @@ -453,9 +455,11 @@ struct isst_base_freq_info { /** * struct isst_turbo_freq_info - Structure to get SST-TF frequencies * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @max_clip_freqs: Maximum number of low priority core clipping frequencies + * @max_buckets: Maximum trl buckets + * @max_trl_levels: Maximum trl levels * @lp_clip_freq_mhz: Clip frequencies per trl level * @bucket_core_counts: Maximum number of cores for a bucket * @trl_freq_mhz: Frequencies per trl level for each bucket diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 7fa67c2031a5..5d3f8c9e3a62 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -197,7 +197,7 @@ struct statmount { */ struct mnt_id_req { __u32 size; - __u32 spare; + __u32 mnt_ns_fd; __u64 mnt_id; __u64 param; __u64 mnt_ns_id; diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index e098759ec917..a25e38d1c874 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -67,4 +67,62 @@ struct nsfs_file_handle { #define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ #define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ +enum init_ns_id { + IPC_NS_INIT_ID = 1ULL, + UTS_NS_INIT_ID = 2ULL, + USER_NS_INIT_ID = 3ULL, + PID_NS_INIT_ID = 4ULL, + CGROUP_NS_INIT_ID = 5ULL, + TIME_NS_INIT_ID = 6ULL, + NET_NS_INIT_ID = 7ULL, + MNT_NS_INIT_ID = 8ULL, +#ifdef __KERNEL__ + NS_LAST_INIT_ID = MNT_NS_INIT_ID, +#endif +}; + +enum ns_type { + TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */ + MNT_NS = (1ULL << 17), /* CLONE_NEWNS */ + CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */ + UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */ + IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */ + USER_NS = (1ULL << 28), /* CLONE_NEWUSER */ + PID_NS = (1ULL << 29), /* CLONE_NEWPID */ + NET_NS = (1ULL << 30), /* CLONE_NEWNET */ +}; + +/** + * struct ns_id_req - namespace ID request structure + * @size: size of this structure + * @spare: reserved for future use + * @filter: filter mask + * @ns_id: last namespace id + * @user_ns_id: owning user namespace ID + * + * Structure for passing namespace ID and miscellaneous parameters to + * statns(2) and listns(2). + * + * For statns(2) @param represents the request mask. + * For listns(2) @param represents the last listed mount id (or zero). + */ +struct ns_id_req { + __u32 size; + __u32 spare; + __u64 ns_id; + struct /* listns */ { + __u32 ns_type; + __u32 spare2; + __u64 user_ns_id; + }; +}; + +/* + * Special @user_ns_id value that can be passed to listns() + */ +#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */ + +/* List of all ns_id_req versions. */ +#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */ + #endif /* __LINUX_NSFS_H */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 78a362b80027..d292f96bc06f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -463,7 +463,9 @@ struct perf_event_attr { inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ - __reserved_1 : 26; + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ + __reserved_1 : 24; union { __u32 wakeup_events; /* wake up every n events */ @@ -1239,6 +1241,22 @@ enum perf_event_type { */ PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + /* + * This user callchain capture was deferred until shortly before + * returning to user space. Previous samples would have kernel + * callchains only and they need to be stitched with this to make full + * callchains. + * + * struct { + * struct perf_event_header header; + * u64 cookie; + * u64 nr; + * u64 ips[nr]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CALLCHAIN_DEFERRED = 22, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1269,6 +1287,7 @@ enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, PERF_CONTEXT_GUEST = (__u64)-2048, PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 957db425d459..ea9a6811fc76 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -26,8 +26,12 @@ #define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */ #define PIDFD_INFO_EXIT (1UL << 3) /* Only returned if requested. */ #define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */ +#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) /* Want/got supported mask flags */ +#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) /* Always returned if PIDFD_INFO_COREDUMP is requested. */ #define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */ +#define PIDFD_INFO_SIZE_VER1 72 /* sizeof second published struct */ +#define PIDFD_INFO_SIZE_VER2 80 /* sizeof third published struct */ /* * Values for @coredump_mask in pidfd_info. @@ -91,8 +95,11 @@ struct pidfd_info { __u32 fsuid; __u32 fsgid; __s32 exit_code; - __u32 coredump_mask; - __u32 __spare1; + struct /* coredump info */ { + __u32 coredump_mask; + __u32 coredump_signal; + }; + __u64 supported_mask; /* Mask flags that this kernel supports */ }; #define PIDFS_IOCTL_MAGIC 0xFF diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index c233aae5eac9..1b76d508400c 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -114,20 +114,13 @@ struct rseq { /* * Restartable sequences flags field. * - * This field should only be updated by the thread which - * registered this data structure. Read by the kernel. - * Mainly used for single-stepping through rseq critical sections - * with debuggers. - * - * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT - * Inhibit instruction sequence block restart on preemption - * for this thread. - * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL - * Inhibit instruction sequence block restart on signal - * delivery for this thread. - * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE - * Inhibit instruction sequence block restart on migration for - * this thread. + * This field was initially intended to allow event masking for + * single-stepping through rseq critical sections with debuggers. + * The kernel does not support this anymore and the relevant bits + * are checked for being always false: + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT + * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE */ __u32 flags; diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 386ad36f1a0a..cab5cadca8ef 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -249,8 +249,9 @@ struct tee_ioctl_param { * @cancel_id: [in] Cancellation id, a unique value to identify this request * @session: [out] Session id * @ret: [out] return value - * @ret_origin [out] origin of the return value - * @num_params [in] number of parameters following this struct + * @ret_origin: [out] origin of the return value + * @num_params: [in] number of &struct tee_ioctl_param entries in @params + * @params: array of ioctl parameters */ struct tee_ioctl_open_session_arg { __u8 uuid[TEE_IOCTL_UUID_LEN]; @@ -276,14 +277,14 @@ struct tee_ioctl_open_session_arg { struct tee_ioctl_buf_data) /** - * struct tee_ioctl_invoke_func_arg - Invokes a function in a Trusted - * Application + * struct tee_ioctl_invoke_arg - Invokes a function in a Trusted Application * @func: [in] Trusted Application function, specific to the TA * @session: [in] Session id * @cancel_id: [in] Cancellation id, a unique value to identify this request * @ret: [out] return value - * @ret_origin [out] origin of the return value - * @num_params [in] number of parameters following this struct + * @ret_origin: [out] origin of the return value + * @num_params: [in] number of parameters following this struct + * @params: array of ioctl parameters */ struct tee_ioctl_invoke_arg { __u32 func; @@ -338,7 +339,8 @@ struct tee_ioctl_close_session_arg { /** * struct tee_iocl_supp_recv_arg - Receive a request for a supplicant function * @func: [in] supplicant function - * @num_params [in/out] number of parameters following this struct + * @num_params: [in/out] number of &struct tee_ioctl_param entries in @params + * @params: array of ioctl parameters * * @num_params is the number of params that tee-supplicant has room to * receive when input, @num_params is the number of actual params @@ -363,7 +365,8 @@ struct tee_iocl_supp_recv_arg { /** * struct tee_iocl_supp_send_arg - Send a response to a received request * @ret: [out] return value - * @num_params [in] number of parameters following this struct + * @num_params: [in] number of &struct tee_ioctl_param entries in @params + * @params: array of ioctl parameters */ struct tee_iocl_supp_send_arg { __u32 ret; @@ -454,11 +457,13 @@ struct tee_ioctl_shm_register_fd_data { */ /** - * struct tee_ioctl_invoke_func_arg - Invokes an object in a Trusted Application + * struct tee_ioctl_object_invoke_arg - Invokes an object in a + * Trusted Application * @id: [in] Object id * @op: [in] Object operation, specific to the object * @ret: [out] return value * @num_params: [in] number of parameters following this struct + * @params: array of ioctl parameters */ struct tee_ioctl_object_invoke_arg { __u64 id; diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 8bf27ab8bcb4..1db45b01532b 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -193,7 +193,8 @@ struct virtio_net_hdr_v1 { struct virtio_net_hdr_v1_hash { struct virtio_net_hdr_v1 hdr; - __le32 hash_value; + __le16 hash_value_lo; + __le16 hash_value_hi; #define VIRTIO_NET_HASH_REPORT_NONE 0 #define VIRTIO_NET_HASH_REPORT_IPv4 1 #define VIRTIO_NET_HASH_REPORT_TCPv4 2 diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 9425cfd9d00e..0f95576bf1f6 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -688,6 +688,13 @@ enum ufshcd_quirks { * single doorbell mode. */ UFSHCD_QUIRK_BROKEN_LSDBS_CAP = 1 << 25, + + /* + * This quirk indicates that DME_LINKSTARTUP should not be issued a 2nd + * time (refer link_startup_again) after the 1st time was successful, + * because it causes link startup to become unreliable. + */ + UFSHCD_QUIRK_PERFORM_LINK_STARTUP_ONCE = 1 << 26, }; enum ufshcd_caps { diff --git a/init/Kconfig b/init/Kconfig index cab3ad28ca49..d1c606ec632e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1913,10 +1913,36 @@ config RSEQ If unsure, say Y. +config RSEQ_STATS + default n + bool "Enable lightweight statistics of restartable sequences" if EXPERT + depends on RSEQ && DEBUG_FS + help + Enable lightweight counters which expose information about the + frequency of RSEQ operations via debugfs. Mostly interesting for + kernel debugging or performance analysis. While lightweight it's + still adding code into the user/kernel mode transitions. + + If unsure, say N. + +config RSEQ_DEBUG_DEFAULT_ENABLE + default n + bool "Enable restartable sequences debug mode by default" if EXPERT + depends on RSEQ + help + This enables the static branch for debug mode of restartable + sequences. + + This also can be controlled on the kernel command line via the + command line parameter "rseq_debug=0/1" and through debugfs. + + If unsure, say N. + config DEBUG_RSEQ default n bool "Enable debugging of rseq() system call" if EXPERT - depends on RSEQ && DEBUG_KERNEL + depends on RSEQ && DEBUG_KERNEL && !GENERIC_ENTRY + select RSEQ_DEBUG_DEFAULT_ENABLE help Enable extra debugging checks for the rseq system call. diff --git a/init/do_mounts.c b/init/do_mounts.c index 6af29da8889e..64d5e25a2cb5 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -120,7 +120,8 @@ static int __init fs_names_setup(char *str) static unsigned int __initdata root_delay; static int __init root_delay_setup(char *str) { - root_delay = simple_strtoul(str, NULL, 0); + if (kstrtouint(str, 0, &root_delay)) + return 0; return 1; } diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 19d9f33dcacf..eddbe5cb0413 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -29,8 +29,7 @@ int __initdata rd_image_start; /* starting block # of image */ static int __init ramdisk_start_setup(char *str) { - rd_image_start = simple_strtol(str,NULL,0); - return 1; + return kstrtoint(str, 0, &rd_image_start) == 0; } __setup("ramdisk_start=", ramdisk_start_setup); diff --git a/init/init_task.c b/init/init_task.c index a55e2189206f..49b13d7c3985 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -62,6 +62,33 @@ unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = { }; #endif +/* init to 2 - one for init_task, one to ensure it is never freed */ +static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; + +/* + * The initial credentials for the initial task + */ +static struct cred init_cred = { + .usage = ATOMIC_INIT(4), + .uid = GLOBAL_ROOT_UID, + .gid = GLOBAL_ROOT_GID, + .suid = GLOBAL_ROOT_UID, + .sgid = GLOBAL_ROOT_GID, + .euid = GLOBAL_ROOT_UID, + .egid = GLOBAL_ROOT_GID, + .fsuid = GLOBAL_ROOT_UID, + .fsgid = GLOBAL_ROOT_GID, + .securebits = SECUREBITS_DEFAULT, + .cap_inheritable = CAP_EMPTY_SET, + .cap_permitted = CAP_FULL_SET, + .cap_effective = CAP_FULL_SET, + .cap_bset = CAP_FULL_SET, + .user = INIT_USER, + .user_ns = &init_user_ns, + .group_info = &init_groups, + .ucounts = &init_ucounts, +}; + /* * Set up the first task table, touch at your own risk!. Base=0, * limit=0x1fffff (=2MB) @@ -223,6 +250,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif +#ifdef CONFIG_SCHED_MM_CID + .mm_cid = { .cid = MM_CID_UNSET, }, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/init/version-timestamp.c b/init/version-timestamp.c index d071835121c2..375726e05f69 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,8 +8,7 @@ #include <linux/utsname.h> struct uts_namespace init_uts_ns = { - .ns.ns_type = ns_common_type(&init_uts_ns), - .ns.__ns_ref = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_uts_ns), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, @@ -19,10 +18,6 @@ struct uts_namespace init_uts_ns = { .domainname = UTS_DOMAINNAME, }, .user_ns = &init_user_ns, - .ns.inum = ns_init_inum(&init_uts_ns), -#ifdef CONFIG_UTS_NS - .ns.ops = &utsns_operations, -#endif }; /* FIXED STRINGS! Don't touch! */ diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index 27a09aa4c9d0..3b75931bd569 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -127,7 +127,7 @@ static int io_uring_cmd_timestamp(struct socket *sock, if (!unlikely(skb_queue_empty(&list))) { scoped_guard(spinlock_irqsave, &q->lock) - skb_queue_splice(q, &list); + skb_queue_splice(&list, q); } return -EAGAIN; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 296667ba712c..02339b74ba8d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -634,6 +634,8 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) is_cqe32 = true; cqe_size <<= 1; } + if (ctx->flags & IORING_SETUP_CQE32) + is_cqe32 = false; if (!dying) { if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 2e99dffddfc5..add03ca75cb9 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -135,7 +135,7 @@ static int io_region_pin_pages(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg) { - unsigned long size = mr->nr_pages << PAGE_SHIFT; + unsigned long size = (size_t) mr->nr_pages << PAGE_SHIFT; struct page **pages; int nr_pages; diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index 45d3735b2708..3ffac8f72974 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -211,10 +211,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag const struct file_operations *fops = &io_mock_fops; const struct io_uring_sqe *sqe = cmd->sqe; struct io_uring_mock_create mc, __user *uarg; - struct io_mock_file *mf = NULL; - struct file *file = NULL; + struct file *file; + struct io_mock_file *mf __free(kfree) = NULL; size_t uarg_size; - int fd = -1, ret; /* * It's a testing only driver that allows exercising edge cases @@ -246,10 +245,6 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag if (!mf) return -ENOMEM; - ret = fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); - if (fd < 0) - goto fail; - init_waitqueue_head(&mf->poll_wq); mf->size = mc.file_size; mf->rw_delay_ns = mc.rw_delay_ns; @@ -258,33 +253,25 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag mf->pollable = true; } - file = anon_inode_create_getfile("[io_uring_mock]", fops, - mf, O_RDWR | O_CLOEXEC, NULL); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto fail; - } + FD_PREPARE(fdf, O_RDWR | O_CLOEXEC, + anon_inode_create_getfile("[io_uring_mock]", fops, mf, + O_RDWR | O_CLOEXEC, NULL)); + if (fdf.err) + return fdf.err; - file->f_mode |= FMODE_READ | FMODE_CAN_READ | - FMODE_WRITE | FMODE_CAN_WRITE | - FMODE_LSEEK; + retain_and_null_ptr(mf); + file = fd_prepare_file(fdf); + file->f_mode |= FMODE_READ | FMODE_CAN_READ | FMODE_WRITE | + FMODE_CAN_WRITE | FMODE_LSEEK; if (mc.flags & IORING_MOCK_CREATE_F_SUPPORT_NOWAIT) file->f_mode |= FMODE_NOWAIT; - mc.out_fd = fd; - if (copy_to_user(uarg, &mc, uarg_size)) { - fput(file); - ret = -EFAULT; - goto fail; - } + mc.out_fd = fd_prepare_fd(fdf); + if (copy_to_user(uarg, &mc, uarg_size)) + return -EFAULT; - fd_install(fd, file); + fd_publish(fdf); return 0; -fail: - if (fd >= 0) - put_unused_fd(fd); - kfree(mf); - return ret; } static int io_probe_mock(struct io_uring_cmd *cmd) diff --git a/io_uring/net.c b/io_uring/net.c index a95cc9ca2a4d..43d77f95db51 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1532,8 +1532,10 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; int ret; - ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req, - &kmsg->vec, uvec_segs, issue_flags); + sr->notif->buf_index = req->buf_index; + ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, + sr->notif, &kmsg->vec, uvec_segs, + issue_flags); if (unlikely(ret)) return ret; req->flags &= ~REQ_F_IMPORT_BUFFER; diff --git a/io_uring/query.c b/io_uring/query.c index 645301bd2c82..cf02893ba911 100644 --- a/io_uring/query.c +++ b/io_uring/query.c @@ -20,6 +20,8 @@ static ssize_t io_query_ops(void *data) e->ring_setup_flags = IORING_SETUP_FLAGS; e->enter_flags = IORING_ENTER_FLAGS; e->sqe_flags = SQE_VALID_FLAGS; + e->nr_query_opcodes = __IO_URING_QUERY_MAX; + e->__pad = 0; return sizeof(*e); } diff --git a/io_uring/register.c b/io_uring/register.c index 2e4717f1357c..d189b266b8cc 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -827,9 +827,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_QUERY: ret = io_query(ctx, arg, nr_args); break; - case IORING_REGISTER_ZCRX_REFILL: - ret = io_zcrx_return_bufs(ctx, arg, nr_args); - break; default: ret = -EINVAL; break; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d787c16dc1c3..0010c4992490 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -943,8 +943,8 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, struct req_iterator rq_iter; struct io_mapped_ubuf *imu; struct io_rsrc_node *node; - struct bio_vec bv, *bvec; - u16 nr_bvecs; + struct bio_vec bv; + unsigned int nr_bvecs = 0; int ret = 0; io_ring_submit_lock(ctx, issue_flags); @@ -965,8 +965,11 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, goto unlock; } - nr_bvecs = blk_rq_nr_phys_segments(rq); - imu = io_alloc_imu(ctx, nr_bvecs); + /* + * blk_rq_nr_phys_segments() may overestimate the number of bvecs + * but avoids needing to iterate over the bvecs + */ + imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); if (!imu) { kfree(node); ret = -ENOMEM; @@ -977,16 +980,15 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, imu->len = blk_rq_bytes(rq); imu->acct_pages = 0; imu->folio_shift = PAGE_SHIFT; - imu->nr_bvecs = nr_bvecs; refcount_set(&imu->refs, 1); imu->release = release; imu->priv = rq; imu->is_kbuf = true; imu->dir = 1 << rq_data_dir(rq); - bvec = imu->bvec; rq_for_each_bvec(bv, rq, rq_iter) - *bvec++ = bv; + imu->bvec[nr_bvecs++] = bv; + imu->nr_bvecs = nr_bvecs; node->buf = imu; data->nodes[index] = node; @@ -1403,8 +1405,11 @@ static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, size_t max_segs = 0; unsigned i; - for (i = 0; i < nr_iovs; i++) + for (i = 0; i < nr_iovs; i++) { max_segs += (iov[i].iov_len >> shift) + 2; + if (max_segs > INT_MAX) + return -EOVERFLOW; + } return max_segs; } @@ -1510,7 +1515,11 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, if (unlikely(ret)) return ret; } else { - nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); + int ret = io_estimate_bvec_size(iov, nr_iovs, imu); + + if (ret < 0) + return ret; + nr_segs = ret; } if (sizeof(struct bio_vec) > sizeof(struct iovec)) { diff --git a/io_uring/rw.c b/io_uring/rw.c index 5b2241a5813c..6310a3d08409 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -277,7 +277,6 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, } else { rw->kiocb.ki_ioprio = get_current_ioprio(); } - rw->kiocb.dio_complete = NULL; rw->kiocb.ki_flags = 0; rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream); @@ -463,7 +462,10 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { + struct io_async_rw *rw = req->async_data; + lockdep_assert_held(&req->ctx->uring_lock); + io_vec_free(&rw->vec); io_rw_recycle(req, 0); } @@ -566,15 +568,6 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) { - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct kiocb *kiocb = &rw->kiocb; - - if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) { - long res = kiocb->dio_complete(rw->kiocb.private); - - io_req_set_res(req, io_fixup_rw_res(req, res), 0); - } - io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) @@ -589,10 +582,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res) struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); struct io_kiocb *req = cmd_to_io_kiocb(rw); - if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { - __io_complete_rw_common(req, res); - io_req_set_res(req, io_fixup_rw_res(req, res), 0); - } + __io_complete_rw_common(req, res); + io_req_set_res(req, io_fixup_rw_res(req, res), 0); req->io_task_work.func = io_req_rw_complete; __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE); } diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index a816f5902091..b1b723222cdb 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -928,74 +928,6 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { .uninstall = io_pp_uninstall, }; -#define IO_ZCRX_MAX_SYS_REFILL_BUFS (1 << 16) -#define IO_ZCRX_SYS_REFILL_BATCH 32 - -static void io_return_buffers(struct io_zcrx_ifq *ifq, - struct io_uring_zcrx_rqe *rqes, unsigned nr) -{ - int i; - - for (i = 0; i < nr; i++) { - struct net_iov *niov; - netmem_ref netmem; - - if (!io_parse_rqe(&rqes[i], ifq, &niov)) - continue; - - scoped_guard(spinlock_bh, &ifq->rq_lock) { - if (!io_zcrx_put_niov_uref(niov)) - continue; - } - - netmem = net_iov_to_netmem(niov); - if (!page_pool_unref_and_test(netmem)) - continue; - io_zcrx_return_niov(niov); - } -} - -int io_zcrx_return_bufs(struct io_ring_ctx *ctx, - void __user *arg, unsigned nr_arg) -{ - struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH]; - struct io_uring_zcrx_rqe __user *user_rqes; - struct io_uring_zcrx_sync_refill zr; - struct io_zcrx_ifq *ifq; - unsigned nr, i; - - if (nr_arg) - return -EINVAL; - if (copy_from_user(&zr, arg, sizeof(zr))) - return -EFAULT; - if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS) - return -EINVAL; - if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv))) - return -EINVAL; - - ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id); - if (!ifq) - return -EINVAL; - nr = zr.nr_entries; - user_rqes = u64_to_user_ptr(zr.rqes); - - for (i = 0; i < nr;) { - unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH); - size_t size = batch * sizeof(rqes[0]); - - if (copy_from_user(rqes, user_rqes + i, size)) - return i ? i : -EFAULT; - io_return_buffers(ifq, rqes, batch); - - i += batch; - - if (fatal_signal_pending(current)) - return i; - cond_resched(); - } - return nr; -} - static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, struct io_zcrx_ifq *ifq, int off, int len) { diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 33ef61503092..a48871b5adad 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -63,8 +63,6 @@ struct io_zcrx_ifq { }; #if defined(CONFIG_IO_URING_ZCRX) -int io_zcrx_return_bufs(struct io_ring_ctx *ctx, - void __user *arg, unsigned nr_arg); int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); @@ -97,11 +95,6 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct { return NULL; } -static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx, - void __user *arg, unsigned nr_arg) -{ - return -EOPNOTSUPP; -} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 093551fe66a7..56e811f9e5fa 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -892,15 +892,35 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro, return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc); } +static struct file *mqueue_file_open(struct filename *name, + struct vfsmount *mnt, int oflag, bool ro, + umode_t mode, struct mq_attr *attr) +{ + struct dentry *dentry; + struct file *file; + int ret; + + dentry = start_creating_noperm(mnt->mnt_root, &QSTR(name->name)); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + + ret = prepare_open(dentry, oflag, ro, mode, name, attr); + file = ERR_PTR(ret); + if (!ret) { + const struct path path = { .mnt = mnt, .dentry = dentry }; + file = dentry_open(&path, oflag, current_cred()); + } + + end_creating(dentry); + return file; +} + static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, struct mq_attr *attr) { + struct filename *name __free(putname) = NULL;; struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt; - struct dentry *root = mnt->mnt_root; - struct filename *name; - struct path path; - int fd, error; - int ro; + int fd, ro; audit_mq_open(oflag, mode, attr); @@ -908,37 +928,10 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, if (IS_ERR(name)) return PTR_ERR(name); - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) - goto out_putname; - ro = mnt_want_write(mnt); /* we'll drop it in any case */ - inode_lock(d_inode(root)); - path.dentry = lookup_noperm(&QSTR(name->name), root); - if (IS_ERR(path.dentry)) { - error = PTR_ERR(path.dentry); - goto out_putfd; - } - path.mnt = mntget(mnt); - error = prepare_open(path.dentry, oflag, ro, mode, name, attr); - if (!error) { - struct file *file = dentry_open(&path, oflag, current_cred()); - if (!IS_ERR(file)) - fd_install(fd, file); - else - error = PTR_ERR(file); - } - path_put(&path); -out_putfd: - if (error) { - put_unused_fd(fd); - fd = error; - } - inode_unlock(d_inode(root)); + fd = FD_ADD(O_CLOEXEC, mqueue_file_open(name, mnt, oflag, ro, mode, attr)); if (!ro) mnt_drop_write(mnt); -out_putname: - putname(name); return fd; } @@ -957,7 +950,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) int err; struct filename *name; struct dentry *dentry; - struct inode *inode = NULL; + struct inode *inode; struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; struct vfsmount *mnt = ipc_ns->mq_mnt; @@ -969,26 +962,20 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = mnt_want_write(mnt); if (err) goto out_name; - inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT); - dentry = lookup_noperm(&QSTR(name->name), mnt->mnt_root); + dentry = start_removing_noperm(mnt->mnt_root, &QSTR(name->name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); - goto out_unlock; + goto out_drop_write; } inode = d_inode(dentry); - if (!inode) { - err = -ENOENT; - } else { - ihold(inode); - err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent), - dentry, NULL); - } - dput(dentry); - -out_unlock: - inode_unlock(d_inode(mnt->mnt_root)); + ihold(inode); + err = vfs_unlink(&nop_mnt_idmap, d_inode(mnt->mnt_root), + dentry, NULL); + end_removing(dentry); iput(inode); + +out_drop_write: mnt_drop_write(mnt); out_name: putname(name); diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 7a03f6d03de3..e28f0cecb2ec 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -27,13 +27,8 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. */ struct ipc_namespace init_ipc_ns = { - .ns.__ns_ref = REFCOUNT_INIT(1), + .ns = NS_COMMON_INIT(init_ipc_ns), .user_ns = &init_user_ns, - .ns.inum = ns_init_inum(&init_ipc_ns), -#ifdef CONFIG_IPC_NS - .ns.ops = &ipcns_operations, -#endif - .ns.ns_type = ns_common_type(&init_ipc_ns), }; struct msg_msgseg { diff --git a/ipc/namespace.c b/ipc/namespace.c index 59b12fcb40bd..c0dbfdd9015f 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -66,6 +66,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (err) goto fail_free; + ns_tree_gen_id(ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; @@ -86,7 +87,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, sem_init_ns(ns); shm_init_ns(ns); - ns_tree_add(ns); + ns_tree_add_raw(ns); return ns; diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 422270d64820..54e581072617 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -109,6 +109,15 @@ config KEXEC_HANDOVER to keep data or state alive across the kexec. For this to work, both source and target kernels need to have this option enabled. +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index df3dd8291bb6..9fe722305c9b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/acct.c b/kernel/acct.c index 61630110e29d..2a2b3c874acd 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -520,26 +520,23 @@ static void fill_ac(struct bsd_acct_struct *acct) static void acct_write_process(struct bsd_acct_struct *acct) { struct file *file = acct->file; - const struct cred *cred; acct_t *ac = &acct->ac; /* Perform file operations on behalf of whoever enabled accounting */ - cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. Then get freeze protection. If - * the fs is frozen, just skip the write as we could deadlock - * the system otherwise. - */ - if (check_free_space(acct) && file_start_write_trylock(file)) { - /* it's been opened O_APPEND, so position is irrelevant */ - loff_t pos = 0; - __kernel_write(file, ac, sizeof(acct_t), &pos); - file_end_write(file); + scoped_with_creds(file->f_cred) { + /* + * First check to see if there is enough free_space to continue + * the process accounting system. Then get freeze protection. If + * the fs is frozen, just skip the write as we could deadlock + * the system otherwise. + */ + if (check_free_space(acct) && file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, ac, sizeof(acct_t), &pos); + file_end_write(file); + } } - - revert_creds(cred); } static void do_acct_process(struct bsd_acct_struct *acct) diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 6ac35430c573..eec60b57bd3d 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -634,37 +634,24 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { struct bpf_iter_link *iter_link; - struct file *file; unsigned int flags; - int err, fd; + int err; if (link->ops != &bpf_iter_link_lops) return -EINVAL; flags = O_RDONLY | O_CLOEXEC; - fd = get_unused_fd_flags(flags); - if (fd < 0) - return fd; - - file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto free_fd; - } + + FD_PREPARE(fdf, flags, anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags)); + if (fdf.err) + return fdf.err; iter_link = container_of(link, struct bpf_iter_link, link); - err = prepare_seq_file(file, iter_link); + err = prepare_seq_file(fd_prepare_file(fdf), iter_link); if (err) - goto free_file; + return err; /* Automatic cleanup handles fput */ - fd_install(fd, file); - return fd; - -free_file: - fput(file); -free_fd: - put_unused_fd(fd); - return err; + return fd_publish(fdf); } struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index eb25e70e0bdc..e4007fea4909 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4169,7 +4169,8 @@ release_prog: } /** - * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4178,15 +4179,17 @@ release_prog: * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); } /** - * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4195,9 +4198,10 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } @@ -4376,9 +4380,9 @@ BTF_ID_FLAGS(func, bpf_strnstr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4d53cdd1374c..8f1dacaf01fe 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, max_depth = sysctl_perf_event_max_stack; trace = get_perf_callchain(regs, kernel, user, max_depth, - false, false); + false, false, 0); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, kernel, user, max_depth, - crosstask, false); + crosstask, false, 0); if (unlikely(!trace) || trace->nr < skip) { if (may_fault) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index eb6c5a21c2ef..ff16c631951b 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -355,7 +355,8 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ -__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, void *aux__prog) { struct bpf_bprintf_data data = { .get_bin_args = true, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a129746bd6c..6cde6a46babf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2330,7 +2330,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) return; if (audit_enabled == AUDIT_OFF) return; - if (!in_irq() && !irqs_disabled()) + if (!in_hardirq() && !irqs_disabled()) ctx = audit_context(); ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); if (unlikely(!ab)) @@ -2428,7 +2428,7 @@ static void __bpf_prog_put(struct bpf_prog *prog) struct bpf_prog_aux *aux = prog->aux; if (atomic64_dec_and_test(&aux->refcnt)) { - if (in_irq() || irqs_disabled()) { + if (in_hardirq() || irqs_disabled()) { INIT_WORK(&aux->work, bpf_prog_put_deferred); schedule_work(&aux->work); } else { diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 0bbe412f854e..feecd8f4dbf9 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -110,16 +110,15 @@ const struct file_operations bpf_token_fops = { int bpf_token_create(union bpf_attr *attr) { + struct bpf_token *token __free(kfree) = NULL; struct bpf_mount_opts *mnt_opts; - struct bpf_token *token = NULL; struct user_namespace *userns; struct inode *inode; - struct file *file; CLASS(fd, f)(attr->token_create.bpffs_fd); struct path path; struct super_block *sb; umode_t mode; - int err, fd; + int err; if (fd_empty(f)) return -EBADF; @@ -166,23 +165,20 @@ int bpf_token_create(union bpf_attr *attr) inode->i_fop = &bpf_token_fops; clear_nlink(inode); /* make sure it is unlinked */ - file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); - if (IS_ERR(file)) { - iput(inode); - return PTR_ERR(file); - } + FD_PREPARE(fdf, O_CLOEXEC, + alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, + O_RDWR, &bpf_token_fops)); + if (fdf.err) + return fdf.err; token = kzalloc(sizeof(*token), GFP_USER); - if (!token) { - err = -ENOMEM; - goto out_file; - } + if (!token) + return -ENOMEM; atomic64_set(&token->refcnt, 1); - /* remember bpffs owning userns for future ns_capable() checks */ - token->userns = get_user_ns(userns); - + /* remember bpffs owning userns for future ns_capable() checks. */ + token->userns = userns; token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; @@ -190,24 +186,11 @@ int bpf_token_create(union bpf_attr *attr) err = security_bpf_token_create(token, attr, &path); if (err) - goto out_token; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - err = fd; - goto out_token; - } - - file->private_data = token; - fd_install(fd, file); - - return fd; + return err; -out_token: - bpf_token_free(token); -out_file: - fput(file); - return err; + get_user_ns(token->userns); + fd_prepare_file(fdf)->private_data = no_free_ptr(token); + return fd_publish(fdf); } int bpf_token_get_info_by_fd(struct bpf_token *token, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5949095e51c3..f2cb0b097093 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -479,11 +479,6 @@ again: * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the * trampoline again, and retry register. */ - /* reset fops->func and fops->trampoline for re-register */ - tr->fops->func = NULL; - tr->fops->trampoline = 0; - - /* free im memory and reallocate later */ bpf_tramp_image_free(im); goto again; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff40e5e65c43..fbe4bb91c564 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8866,7 +8866,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { struct bpf_func_state *fold, *fcur; - int i, fr; + int i, fr, num_slots; reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { @@ -8879,7 +8879,9 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, &fcur->regs[i], &env->idmap_scratch); - for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { + num_slots = min(fold->allocated_stack / BPF_REG_SIZE, + fcur->allocated_stack / BPF_REG_SIZE); + for (i = 0; i < num_slots; i++) { if (!is_spilled_reg(&fold->stack[i]) || !is_spilled_reg(&fcur->stack[i])) continue; @@ -12259,8 +12261,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, - KF_bpf_task_work_schedule_signal, - KF_bpf_task_work_schedule_resume, + KF_bpf_task_work_schedule_signal_impl, + KF_bpf_task_work_schedule_resume_impl, }; BTF_ID_LIST(special_kfunc_list) @@ -12331,13 +12333,13 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal) -BTF_ID(func, bpf_task_work_schedule_resume) +BTF_ID(func, bpf_task_work_schedule_signal_impl) +BTF_ID(func, bpf_task_work_schedule_resume_impl) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index fdee387f0d6b..ae1eb7a85eb4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -250,12 +250,9 @@ bool cgroup_enable_per_threadgroup_rwsem __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.__ns_ref = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_cgroup_ns), .user_ns = &init_user_ns, - .ns.ops = &cgroupns_operations, - .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, - .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; @@ -1522,9 +1519,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void) } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() - * on a task which has already passed exit_task_namespaces() and - * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all - * cgroups visible for lookups. + * on a task which has already passed exit_nsproxy_namespaces() + * and nsproxy == NULL. Fall back to cgrp_dfl_root which will + * make all cgroups visible for lookups. */ return &cgrp_dfl_root.cgrp; } @@ -5363,7 +5360,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; - const struct cred *saved_cred; ssize_t ret; enum cgroup_attach_lock_mode lock_mode; @@ -5386,11 +5382,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * permissions using the credentials from file open to protect against * inherited fd attacks. */ - saved_cred = override_creds(of->file->f_cred); - ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, - threadgroup, ctx->ns); - revert_creds(saved_cred); + scoped_with_creds(of->file->f_cred) + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); if (ret) goto out_finish; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 52468d2c178a..4aaad07b0bd1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1391,7 +1391,7 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, return isolcpus_updated; } -static void update_unbound_workqueue_cpumask(bool isolcpus_updated) +static void update_isolation_cpumasks(bool isolcpus_updated) { int ret; @@ -1402,6 +1402,9 @@ static void update_unbound_workqueue_cpumask(bool isolcpus_updated) ret = workqueue_unbound_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); + + ret = tmigr_isolated_exclude_cpumask(isolated_cpus); + WARN_ON_ONCE(ret < 0); } /** @@ -1555,7 +1558,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, list_add(&cs->remote_sibling, &remote_children); cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(isolcpus_updated); cpuset_force_rebuild(); cs->prs_err = 0; @@ -1596,7 +1599,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(isolcpus_updated); cpuset_force_rebuild(); /* @@ -1665,7 +1668,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(isolcpus_updated); if (adding || deleting) cpuset_force_rebuild(); @@ -2023,7 +2026,7 @@ write_error: WARN_ON_ONCE(parent->nr_subparts < 0); } spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); @@ -3043,7 +3046,7 @@ out: else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(isolcpus_updated); + update_isolation_cpumasks(isolcpus_updated); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); @@ -4180,7 +4183,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { - do_set_cpus_allowed(tsk, cs_mask); + set_cpus_allowed_force(tsk, cs_mask); changed = true; } rcu_read_unlock(); diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index fdbe57578e68..db9617556dd7 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -30,7 +30,6 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) ret = ns_common_init(new_ns); if (ret) return ERR_PTR(ret); - ns_tree_add(new_ns); return no_free_ptr(new_ns); } @@ -86,6 +85,7 @@ struct cgroup_namespace *copy_cgroup_ns(u64 flags, new_ns->ucounts = ucounts; new_ns->root_cset = cset; + ns_tree_add(new_ns); return new_ns; } diff --git a/kernel/cpu.c b/kernel/cpu.c index db9f6c539b28..b674fdf96208 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -3085,10 +3085,13 @@ EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE struct cpumask __cpu_possible_mask __ro_after_init = {CPU_BITS_ALL}; +unsigned int __num_possible_cpus __ro_after_init = NR_CPUS; #else struct cpumask __cpu_possible_mask __ro_after_init; +unsigned int __num_possible_cpus __ro_after_init; #endif EXPORT_SYMBOL(__cpu_possible_mask); +EXPORT_SYMBOL(__num_possible_cpus); struct cpumask __cpu_online_mask __read_mostly; EXPORT_SYMBOL(__cpu_online_mask); @@ -3116,6 +3119,7 @@ void init_cpu_present(const struct cpumask *src) void init_cpu_possible(const struct cpumask *src) { cpumask_copy(&__cpu_possible_mask, src); + __num_possible_cpus = cpumask_weight(&__cpu_possible_mask); } void set_cpu_online(unsigned int cpu, bool online) @@ -3140,6 +3144,21 @@ void set_cpu_online(unsigned int cpu, bool online) } /* + * This should be marked __init, but there is a boatload of call sites + * which need to be fixed up to do so. Sigh... + */ +void set_cpu_possible(unsigned int cpu, bool possible) +{ + if (possible) { + if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask)) + __num_possible_cpus++; + } else { + if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask)) + __num_possible_cpus--; + } +} + +/* * Activate the first processor. */ void __init boot_cpu_init(void) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 3b1c43382eec..99dac1aa972a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -373,7 +373,7 @@ static int __crash_shrink_memory(struct resource *old_res, old_res->start = 0; old_res->end = 0; } else { - crashk_res.end = ram_res->start - 1; + old_res->end = ram_res->start - 1; } crash_free_reserved_phys_range(ram_res->start, ram_res->end); diff --git a/kernel/cred.c b/kernel/cred.c index dbf6b687dc5c..a6f686b30da1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,33 +35,6 @@ do { \ static struct kmem_cache *cred_jar; -/* init to 2 - one for init_task, one to ensure it is never freed */ -static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; - -/* - * The initial credentials for the initial task - */ -struct cred init_cred = { - .usage = ATOMIC_INIT(4), - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, - .suid = GLOBAL_ROOT_UID, - .sgid = GLOBAL_ROOT_GID, - .euid = GLOBAL_ROOT_UID, - .egid = GLOBAL_ROOT_GID, - .fsuid = GLOBAL_ROOT_UID, - .fsgid = GLOBAL_ROOT_GID, - .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_EMPTY_SET, - .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_FULL_SET, - .cap_bset = CAP_FULL_SET, - .user = INIT_USER, - .user_ns = &init_user_ns, - .group_info = &init_groups, - .ucounts = &init_ucounts, -}; - /* * The RCU callback to actually dispose of a set of credentials */ @@ -306,6 +279,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags) kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); return 0; } @@ -343,6 +317,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags) p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); + return 0; error_put: @@ -435,10 +411,13 @@ int commit_creds(struct cred *new) */ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); + rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); + if (new->user_ns != old->user_ns) + switch_cred_namespaces(old, new); /* send notifications */ if (!uid_eq(new->uid, old->uid) || diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1f9ee9759426..f973e7e73c90 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -481,6 +481,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, case PCI_P2PDMA_MAP_BUS_ADDR: sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, sg_phys(sg)); + sg_dma_len(sg) = sg->length; sg_dma_mark_bus_address(sg); continue; default: diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f62e1d1b2063..5c792b30c58a 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -11,19 +11,20 @@ /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - * @regs: Pointer to pt_regs on entry stack - * @ti_work: TIF work flags as read by the caller - */ -__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work) +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) +#else +#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) +#endif + +static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { /* * Before returning to user space ensure that all pending work * items have been completed. */ - while (ti_work & EXIT_TO_USER_MODE_WORK) { + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { local_irq_enable_exit_to_user(ti_work); @@ -62,17 +63,21 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + * @regs: Pointer to pt_regs on entry stack + * @ti_work: TIF work flags as read by the caller + */ +__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work) { - enter_from_user_mode(regs); -} + for (;;) { + ti_work = __exit_to_user_mode_loop(regs, ti_work); -noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - exit_to_user_mode_prepare(regs); - instrumentation_end(); - exit_to_user_mode(); + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) + return ti_work; + ti_work = read_thread_flags(); + } } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c index 66e6ba7fa80c..940a597ded40 100644 --- a/kernel/entry/syscall-common.c +++ b/kernel/entry/syscall-common.c @@ -63,14 +63,6 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, return ret ? : syscall; } -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) -{ - enter_from_user_mode(regs); - instrumentation_begin(); - local_irq_enable(); - instrumentation_end(); -} - /* * If SYSCALL_EMU is set, then the only reason to report is when * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 808c0d7a31fa..b9c7e00725d6 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; @@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, regs = task_pt_regs(current); } + if (defer_cookie) { + /* + * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED + * which can be stitched to this one, and add + * the cookie after it (it will be cut off when the + * user stack is copied to the callchain). + */ + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED); + perf_callchain_store_context(&ctx, defer_cookie); + goto exit_put; + } + if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); diff --git a/kernel/events/core.c b/kernel/events/core.c index 177e57c1a362..ece716879cbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -56,6 +56,7 @@ #include <linux/buildid.h> #include <linux/task_work.h> #include <linux/percpu-rwsem.h> +#include <linux/unwind_deferred.h> #include "internal.h" @@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; +static struct unwind_work perf_unwind_work; + struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { @@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && + event->attr.defer_callchain; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + u64 defer_cookie; if (!current->mm) user = false; @@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, kernel, user, - max_stack, crosstask, true); + if (!(user && defer_user && !crosstask && + unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) + defer_cookie = 0; + + callchain = get_perf_callchain(regs, kernel, user, max_stack, + crosstask, true, defer_cookie); + return callchain ?: &__empty_callchain; } @@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_callchain_deferred_event { + struct unwind_stacktrace *trace; + struct { + struct perf_event_header header; + u64 cookie; + u64 nr; + u64 ips[]; + } event; +}; + +static void perf_callchain_deferred_output(struct perf_event *event, void *data) +{ + struct perf_callchain_deferred_event *deferred_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret, size = deferred_event->event.header.size; + + if (!event->attr.defer_output) + return; + + /* XXX do we really need sample_id_all for this ??? */ + perf_event_header__init_id(&deferred_event->event.header, &sample, event); + + ret = perf_output_begin(&handle, &sample, event, + deferred_event->event.header.size); + if (ret) + goto out; + + perf_output_put(&handle, deferred_event->event); + for (int i = 0; i < deferred_event->trace->nr; i++) { + u64 entry = deferred_event->trace->entries[i]; + perf_output_put(&handle, entry); + } + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + deferred_event->event.header.size = size; +} + +static void perf_unwind_deferred_callback(struct unwind_work *work, + struct unwind_stacktrace *trace, u64 cookie) +{ + struct perf_callchain_deferred_event deferred_event = { + .trace = trace, + .event = { + .header = { + .type = PERF_RECORD_CALLCHAIN_DEFERRED, + .misc = PERF_RECORD_MISC_USER, + .size = sizeof(deferred_event.event) + + (trace->nr * sizeof(u64)), + }, + .cookie = cookie, + .nr = trace->nr, + }, + }; + + perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); +} + struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; @@ -11773,7 +11844,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event = container_of(hrtimer, struct perf_event, hw.hrtimer); - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state != PERF_EVENT_STATE_ACTIVE || + event->hw.state & PERF_HES_STOPPED) return HRTIMER_NORESTART; event->pmu->read(event); @@ -11819,15 +11891,20 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; /* - * The throttle can be triggered in the hrtimer handler. - * The HRTIMER_NORESTART should be used to stop the timer, - * rather than hrtimer_cancel(). See perf_swevent_hrtimer() + * Careful: this function can be triggered in the hrtimer handler, + * for cpu-clock events, so hrtimer_cancel() would cause a + * deadlock. + * + * So use hrtimer_try_to_cancel() to try to stop the hrtimer, + * and the cpu-clock handler also sets the PERF_HES_STOPPED flag, + * which guarantees that perf_swevent_hrtimer() will stop the + * hrtimer once it sees the PERF_HES_STOPPED flag. */ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); - hrtimer_cancel(&hwc->hrtimer); + hrtimer_try_to_cancel(&hwc->hrtimer); } } @@ -11871,12 +11948,14 @@ static void cpu_clock_event_update(struct perf_event *event) static void cpu_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) cpu_clock_event_update(event); @@ -11893,7 +11972,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) static void cpu_clock_event_del(struct perf_event *event, int flags) { - cpu_clock_event_stop(event, flags); + cpu_clock_event_stop(event, PERF_EF_UPDATE); } static void cpu_clock_event_read(struct perf_event *event) @@ -11950,12 +12029,14 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) task_clock_event_update(event, event->ctx->time); @@ -14799,6 +14880,9 @@ void __init perf_event_init(void) idr_init(&pmu_idr); + unwind_deferred_init(&perf_unwind_work, + perf_unwind_deferred_callback); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); diff --git a/kernel/exit.c b/kernel/exit.c index 9f74e8f1c431..b9667ffcf7b3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -291,6 +291,7 @@ repeat: write_unlock_irq(&tasklist_lock); /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); + exit_cred_namespaces(p); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); @@ -910,6 +911,7 @@ void __noreturn do_exit(long code) user_events_exit(tsk); io_uring_files_cancel(); + sched_mm_cid_exit(tsk); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); @@ -939,7 +941,6 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); - unwind_deferred_task_exit(tsk); trace_sched_process_exit(tsk, group_dead); /* @@ -950,6 +951,12 @@ void __noreturn do_exit(long code) * gets woken up by child-exit notifications. */ perf_event_exit_task(tsk); + /* + * PF_EXITING (above) ensures unwind_deferred_request() will no + * longer add new unwinds. While exit_mm() (below) will destroy the + * abaility to do unwinds. So flush any pending unwinds here. + */ + unwind_deferred_task_exit(tsk); exit_mm(); @@ -962,7 +969,7 @@ void __noreturn do_exit(long code) exit_fs(tsk); if (group_dead) disassociate_ctty(1); - exit_task_namespaces(tsk); + exit_nsproxy_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..83e05d6f2307 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,10 +955,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif #ifdef CONFIG_SCHED_MM_CID - tsk->mm_cid = -1; - tsk->last_mm_cid = -1; - tsk->mm_cid_active = 0; - tsk->migrate_from_cpu = -1; + tsk->mm_cid.cid = MM_CID_UNSET; + tsk->mm_cid.active = 0; #endif return tsk; @@ -2453,9 +2451,10 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - exit_task_namespaces(p); + exit_nsproxy_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { + sched_mm_cid_exit(p); mm_clear_owner(p->mm, p); mmput(p->mm); } @@ -2487,6 +2486,7 @@ bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + exit_cred_namespaces(p); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 125804fbb5cb..cf7e610eac42 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -581,7 +581,7 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, if (flags & FLAGS_NUMA) { u32 __user *naddr = (void *)uaddr + size / 2; - if (futex_get_value(&node, naddr)) + if (get_user_inline(node, naddr)) return -EFAULT; if ((node != FUTEX_NO_NODE) && @@ -601,7 +601,7 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, node = numa_node_id(); node_updated = true; } - if (node_updated && futex_put_value(node, naddr)) + if (node_updated && put_user_inline(node, naddr)) return -EFAULT; } @@ -1680,10 +1680,10 @@ static bool futex_ref_get(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; - guard(rcu)(); + guard(preempt)(); - if (smp_load_acquire(&fph->state) == FR_PERCPU) { - this_cpu_inc(*mm->futex_ref); + if (READ_ONCE(fph->state) == FR_PERCPU) { + __this_cpu_inc(*mm->futex_ref); return true; } @@ -1694,10 +1694,10 @@ static bool futex_ref_put(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; - guard(rcu)(); + guard(preempt)(); - if (smp_load_acquire(&fph->state) == FR_PERCPU) { - this_cpu_dec(*mm->futex_ref); + if (READ_ONCE(fph->state) == FR_PERCPU) { + __this_cpu_dec(*mm->futex_ref); return false; } diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 2cd57096c38e..30c2afa03889 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -281,63 +281,11 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 return ret; } -/* - * This does a plain atomic user space read, and the user pointer has - * already been verified earlier by get_futex_key() to be both aligned - * and actually in user space, just like futex_atomic_cmpxchg_inatomic(). - * - * We still want to avoid any speculation, and while __get_user() is - * the traditional model for this, it's actually slower than doing - * this manually these days. - * - * We could just have a per-architecture special function for it, - * the same way we do futex_atomic_cmpxchg_inatomic(), but rather - * than force everybody to do that, write it out long-hand using - * the low-level user-access infrastructure. - * - * This looks a bit overkill, but generally just results in a couple - * of instructions. - */ -static __always_inline int futex_get_value(u32 *dest, u32 __user *from) -{ - u32 val; - - if (can_do_masked_user_access()) - from = masked_user_access_begin(from); - else if (!user_read_access_begin(from, sizeof(*from))) - return -EFAULT; - unsafe_get_user(val, from, Efault); - user_read_access_end(); - *dest = val; - return 0; -Efault: - user_read_access_end(); - return -EFAULT; -} - -static __always_inline int futex_put_value(u32 val, u32 __user *to) -{ - if (can_do_masked_user_access()) - to = masked_user_access_begin(to); - else if (!user_write_access_begin(to, sizeof(*to))) - return -EFAULT; - unsafe_put_user(val, to, Efault); - user_write_access_end(); - return 0; -Efault: - user_write_access_end(); - return -EFAULT; -} - +/* Read from user memory with pagefaults disabled */ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { - int ret; - - pagefault_disable(); - ret = futex_get_value(dest, from); - pagefault_enable(); - - return ret; + guard(pagefault)(); + return get_user_inline(*dest, from); } extern void __futex_unqueue(struct futex_q *q); diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index a08cc076f332..ffde93d051a4 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include <linux/mm.h> #include "gcov.h" -#if (__GNUC__ >= 14) +#if (__GNUC__ >= 15) +#define GCOV_COUNTERS 10 +#elif (__GNUC__ >= 14) #define GCOV_COUNTERS 9 #elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d1917b28761a..678f094d261a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -897,8 +897,9 @@ void handle_percpu_irq(struct irq_desc *desc) void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; unsigned int irq = irq_desc_get_irq(desc); + unsigned int cpu = smp_processor_id(); + struct irqaction *action; irqreturn_t res; /* @@ -910,12 +911,15 @@ void handle_percpu_devid_irq(struct irq_desc *desc) if (chip->irq_ack) chip->irq_ack(&desc->irq_data); + for (action = desc->action; action; action = action->next) + if (cpumask_test_cpu(cpu, action->affinity)) + break; + if (likely(action)) { trace_irq_handler_entry(irq, action); res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); trace_irq_handler_exit(irq, action, res); } else { - unsigned int cpu = smp_processor_id(); bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); if (enabled) @@ -929,31 +933,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } -/** - * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu - * dev ids - * @desc: the interrupt description structure for this irq - * - * Similar to handle_fasteoi_nmi, but handling the dev_id cookie - * as a percpu pointer. - */ -void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; - unsigned int irq = irq_desc_get_irq(desc); - irqreturn_t res; - - __kstat_incr_irqs_this_cpu(desc); - - trace_irq_handler_entry(irq, action); - res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); - trace_irq_handler_exit(irq, action, res); - - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); -} - static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e103451243a0..786f5570a640 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,7 +133,15 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) */ atomic_inc(&desc->threads_active); - wake_up_process(action->thread); + /* + * This might be a premature wakeup before the thread reached the + * thread function and set the IRQTF_READY bit. It's waiting in + * kthread code with state UNINTERRUPTIBLE. Once it reaches the + * thread function it waits with INTERRUPTIBLE. The wakeup is not + * lost in that case because the thread is guaranteed to observe + * the RUN flag before it goes to sleep in wait_for_interrupt(). + */ + wake_up_state(action->thread, TASK_INTERRUPTIBLE); } static DEFINE_STATIC_KEY_FALSE(irqhandler_duration_check_enabled); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index db714d3014b5..6acf268f005b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -879,8 +879,7 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) chip_bus_sync_unlock(desc); } -int irq_set_percpu_devid_partition(unsigned int irq, - const struct cpumask *affinity) +int irq_set_percpu_devid(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -892,31 +891,10 @@ int irq_set_percpu_devid_partition(unsigned int irq, if (!desc->percpu_enabled) return -ENOMEM; - desc->percpu_affinity = affinity ? : cpu_possible_mask; - irq_set_percpu_devid_flags(irq); return 0; } -int irq_set_percpu_devid(unsigned int irq) -{ - return irq_set_percpu_devid_partition(irq, NULL); -} - -int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc || !desc->percpu_enabled) - return -EINVAL; - - if (affinity) - cpumask_copy(affinity, desc->percpu_affinity); - - return 0; -} -EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); - void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index dc473faadcc8..2652c4cfd877 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -867,13 +867,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, } EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec); -unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) +static struct irq_domain *fwspec_to_domain(struct irq_fwspec *fwspec) { struct irq_domain *domain; - struct irq_data *irq_data; - irq_hw_number_t hwirq; - unsigned int type = IRQ_TYPE_NONE; - int virq; if (fwspec->fwnode) { domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED); @@ -883,6 +879,32 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) domain = irq_default_domain; } + return domain; +} + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) +{ + struct irq_domain *domain = fwspec_to_domain(fwspec); + + memset(info, 0, sizeof(*info)); + + if (!domain || !domain->ops->get_fwspec_info) + return 0; + + return domain->ops->get_fwspec_info(fwspec, info); +} +#endif + +unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) +{ + unsigned int type = IRQ_TYPE_NONE; + struct irq_domain *domain; + struct irq_data *irq_data; + irq_hw_number_t hwirq; + int virq; + + domain = fwspec_to_domain(fwspec); if (!domain) { pr_warn("no irq domain found for %s !\n", of_node_full_name(to_of_node(fwspec->fwnode))); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 400856abf672..0bb29316b436 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -547,7 +547,7 @@ int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *noti INIT_WORK(¬ify->work, irq_affinity_notify); } - scoped_guard(raw_spinlock_irqsave, &desc->lock) { + scoped_guard(raw_spinlock_irq, &desc->lock) { old_notify = desc->affinity_notify; desc->affinity_notify = notify; } @@ -1001,7 +1001,6 @@ static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; - bool valid = false; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; @@ -1018,21 +1017,13 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *a } scoped_guard(raw_spinlock_irq, &desc->lock) { - /* - * This code is triggered unconditionally. Check the affinity - * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. - */ - if (cpumask_available(desc->irq_common_data.affinity)) { - const struct cpumask *m; + const struct cpumask *m; - m = irq_data_get_effective_affinity_mask(&desc->irq_data); - cpumask_copy(mask, m); - valid = true; - } + m = irq_data_get_effective_affinity_mask(&desc->irq_data); + cpumask_copy(mask, m); } - if (valid) - set_cpus_allowed_ptr(current, mask); + set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else @@ -1239,7 +1230,10 @@ static int irq_thread(void *data) irq_thread_set_ready(desc, action); - sched_set_fifo(current); + if (action->handler == irq_forced_secondary_handler) + sched_set_fifo_secondary(current); + else + sched_set_fifo(current); if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) @@ -1405,19 +1399,39 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) * references an already freed task_struct. */ new->thread = get_task_struct(t); + /* - * Tell the thread to set its affinity. This is - * important for shared interrupt handlers as we do - * not invoke setup_affinity() for the secondary - * handlers as everything is already set up. Even for - * interrupts marked with IRQF_NO_BALANCE this is - * correct as we want the thread to move to the cpu(s) - * on which the requesting code placed the interrupt. + * The affinity can not be established yet, but it will be once the + * interrupt is enabled. Delay and defer the actual setting to the + * thread itself once it is ready to run. In the meantime, prevent + * it from ever being re-affined directly by cpuset or + * housekeeping. The proper way to do it is to re-affine the whole + * vector. */ - set_bit(IRQTF_AFFINITY, &new->thread_flags); + kthread_bind_mask(t, cpu_possible_mask); + + /* + * Ensure the thread adjusts the affinity once it reaches the + * thread function. + */ + new->thread_flags = BIT(IRQTF_AFFINITY); + return 0; } +static bool valid_percpu_irqaction(struct irqaction *old, struct irqaction *new) +{ + do { + if (cpumask_intersects(old->affinity, new->affinity) || + old->percpu_dev_id == new->percpu_dev_id) + return false; + + old = old->next; + } while (old); + + return true; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -1438,6 +1452,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; + bool per_cpu_devid; if (!desc) return -EINVAL; @@ -1447,6 +1462,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!try_module_get(desc->owner)) return -ENODEV; + per_cpu_devid = irq_settings_is_per_cpu_devid(desc); + new->irq = irq; /* @@ -1554,13 +1571,20 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ unsigned int oldtype; - if (irq_is_nmi(desc)) { + if (irq_is_nmi(desc) && !per_cpu_devid) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; goto out_unlock; } + if (per_cpu_devid && !valid_percpu_irqaction(old, new)) { + pr_err("Overlapping affinities for %s (irq %d) on irqchip %s.\n", + new->name, irq, desc->irq_data.chip->name); + ret = -EINVAL; + goto out_unlock; + } + /* * If nobody did set the configuration before, inherit * the one provided by the requester. @@ -1711,7 +1735,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!(new->flags & IRQF_NO_AUTOEN) && irq_settings_can_autoenable(desc)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); - } else { + } else if (!per_cpu_devid) { /* * Shared interrupts do not go well with disabling * auto enable. The sharing interrupt might request @@ -2346,7 +2370,7 @@ void disable_percpu_nmi(unsigned int irq) static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; + struct irqaction *action, **action_ptr; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -2354,21 +2378,33 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ return NULL; scoped_guard(raw_spinlock_irqsave, &desc->lock) { - action = desc->action; - if (!action || action->percpu_dev_id != dev_id) { - WARN(1, "Trying to free already-free IRQ %d\n", irq); - return NULL; + action_ptr = &desc->action; + for (;;) { + action = *action_ptr; + + if (!action) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + return NULL; + } + + if (action->percpu_dev_id == dev_id) + break; + + action_ptr = &action->next; } - if (!cpumask_empty(desc->percpu_enabled)) { - WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", - irq, cpumask_first(desc->percpu_enabled)); + if (cpumask_intersects(desc->percpu_enabled, action->affinity)) { + WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq, + cpumask_first_and(desc->percpu_enabled, action->affinity)); return NULL; } /* Found it - now remove it from the list of entries: */ - desc->action = NULL; - desc->istate &= ~IRQS_NMI; + *action_ptr = action->next; + + /* Demote from NMI if we killed the last action */ + if (!desc->action) + desc->istate &= ~IRQS_NMI; } unregister_handler_proc(irq, action); @@ -2442,17 +2478,49 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) return retval; } +static +struct irqaction *create_percpu_irqaction(irq_handler_t handler, unsigned long flags, + const char *devname, const cpumask_t *affinity, + void __percpu *dev_id) +{ + struct irqaction *action; + + if (!affinity) + affinity = cpu_possible_mask; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return NULL; + + action->handler = handler; + action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; + action->name = devname; + action->percpu_dev_id = dev_id; + action->affinity = affinity; + + /* + * We allow some form of sharing for non-overlapping affinity + * masks. Obviously, covering all CPUs prevents any sharing in + * the first place. + */ + if (!cpumask_equal(affinity, cpu_possible_mask)) + action->flags |= IRQF_SHARED; + + return action; +} + /** * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device + * @affinity: A cpumask describing the target CPUs for this interrupt * @dev_id: A percpu cookie passed back to the handler function * - * This call allocates interrupt resources and enables the interrupt on the - * local CPU. If the interrupt is supposed to be enabled on other CPUs, it - * has to be done on each CPU using enable_percpu_irq(). + * This call allocates interrupt resources, but doesn't enable the interrupt + * on any CPU, as all percpu-devid interrupts are flagged with IRQ_NOAUTOEN. + * It has to be done on each CPU using enable_percpu_irq(). * * @dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of @@ -2460,7 +2528,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, - void __percpu *dev_id) + const cpumask_t *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -2477,15 +2545,10 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, if (flags && flags != IRQF_TIMER) return -EINVAL; - action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + action = create_percpu_irqaction(handler, flags, devname, affinity, dev_id); if (!action) return -ENOMEM; - action->handler = handler; - action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; - action->name = devname; - action->percpu_dev_id = dev_id; - retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); @@ -2508,6 +2571,7 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @name: An ascii name for the claiming device + * @affinity: A cpumask describing the target CPUs for this interrupt * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs @@ -2524,8 +2588,8 @@ EXPORT_SYMBOL_GPL(__request_percpu_irq); * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. */ -int request_percpu_nmi(unsigned int irq, irq_handler_t handler, - const char *name, void __percpu *dev_id) +int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, + const struct cpumask *affinity, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -2542,20 +2606,16 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, !irq_supports_nmi(desc)) return -EINVAL; - /* The line cannot already be NMI */ - if (irq_is_nmi(desc)) + /* The line cannot be NMI already if the new request covers all CPUs */ + if (irq_is_nmi(desc) && + (!affinity || cpumask_equal(affinity, cpu_possible_mask))) return -EINVAL; - action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + action = create_percpu_irqaction(handler, IRQF_NO_THREAD | IRQF_NOBALANCING, + name, affinity, dev_id); if (!action) return -ENOMEM; - action->handler = handler; - action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD - | IRQF_NOBALANCING; - action->name = name; - action->percpu_dev_id = dev_id; - retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index e7ad99254841..68886881fe10 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -706,7 +706,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq = ops->get_hwirq(info, arg); int i, ret; - if (irq_find_mapping(domain, hwirq) > 0) + if (irq_resolve_mapping(domain, hwirq)) return -EEXIST; if (domain->parent) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 29c2404e743b..77258eafbf63 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -48,6 +48,8 @@ static int show_irq_affinity(int type, struct seq_file *m) struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask; + guard(raw_spinlock_irq)(&desc->lock); + switch (type) { case AFFINITY: case AFFINITY_LIST: diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 76f0940fb485..03d12e27189f 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "KHO: " fmt +#include <linux/cleanup.h> #include <linux/cma.h> #include <linux/count_zeros.h> #include <linux/debugfs.h> @@ -22,6 +23,7 @@ #include <asm/early_ioremap.h> +#include "kexec_handover_internal.h" /* * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. @@ -67,10 +69,10 @@ early_param("kho", kho_parse_enable); * Keep track of memory that is to be preserved across KHO. * * The serializing side uses two levels of xarrays to manage chunks of per-order - * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a - * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations - * each bitmap will cover 16M of address space. Thus, for 16G of memory at most - * 512K of bitmap memory will be needed for order 0. + * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order + * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 + * allocations each bitmap will cover 128M of address space. Thus, for 16G of + * memory at most 512K of bitmap memory will be needed for order 0. * * This approach is fully incremental, as the serialization progresses folios * can continue be aggregated to the tracker. The final step, immediately prior @@ -78,12 +80,14 @@ early_param("kho", kho_parse_enable); * successor kernel to parse. */ -#define PRESERVE_BITS (512 * 8) +#define PRESERVE_BITS (PAGE_SIZE * 8) struct kho_mem_phys_bits { DECLARE_BITMAP(preserve, PRESERVE_BITS); }; +static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); + struct kho_mem_phys { /* * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized @@ -131,28 +135,28 @@ static struct kho_out kho_out = { .finalized = false, }; -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) { - void *elm, *res; + void *res = xa_load(xa, index); + + if (res) + return res; - elm = xa_load(xa, index); - if (elm) - return elm; + void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); - elm = kzalloc(sz, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); if (xa_is_err(res)) - res = ERR_PTR(xa_err(res)); - - if (res) { - kfree(elm); + return ERR_PTR(xa_err(res)); + else if (res) return res; - } - return elm; + return no_free_ptr(elm); } static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, @@ -167,12 +171,12 @@ static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, const unsigned long pfn_high = pfn >> order; physxa = xa_load(&track->orders, order); - if (!physxa) - continue; + if (WARN_ON_ONCE(!physxa)) + return; bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (!bits) - continue; + if (WARN_ON_ONCE(!bits)) + return; clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); @@ -216,8 +220,7 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } } - bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, - sizeof(*bits)); + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); if (IS_ERR(bits)) return PTR_ERR(bits); @@ -345,15 +348,19 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk; + struct khoser_mem_chunk *chunk __free(free_page) = NULL; - chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + chunk = (void *)get_zeroed_page(GFP_KERNEL); if (!chunk) - return NULL; + return ERR_PTR(-ENOMEM); + + if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + chunk->hdr.order = order; if (cur_chunk) KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); - return chunk; + return no_free_ptr(chunk); } static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) @@ -374,14 +381,17 @@ static int kho_mem_serialize(struct kho_serialization *ser) struct khoser_mem_chunk *chunk = NULL; struct kho_mem_phys *physxa; unsigned long order; + int err = -ENOMEM; xa_for_each(&ser->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys; chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } if (!first_chunk) first_chunk = chunk; @@ -391,8 +401,10 @@ static int kho_mem_serialize(struct kho_serialization *ser) if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } } elm = &chunk->bitmaps[chunk->hdr.num_elms]; @@ -409,7 +421,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) err_free: kho_mem_ser_free(first_chunk); - return -ENOMEM; + return err; } static void __init deserialize_bitmap(unsigned int order, @@ -465,8 +477,8 @@ static void __init kho_mem_deserialize(const void *fdt) * area for early allocations that happen before page allocator is * initialized. */ -static struct kho_scratch *kho_scratch; -static unsigned int kho_scratch_cnt; +struct kho_scratch *kho_scratch; +unsigned int kho_scratch_cnt; /* * The scratch areas are scaled by default as percent of memory allocated from @@ -752,6 +764,9 @@ int kho_preserve_folio(struct folio *folio) const unsigned int order = folio_order(folio); struct kho_mem_track *track = &kho_out.ser.track; + if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) + return -EINVAL; + return __kho_preserve_order(track, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); @@ -775,6 +790,11 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) unsigned long failed_pfn = 0; int err = 0; + if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT))) { + return -EINVAL; + } + while (pfn < end_pfn) { const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); @@ -862,16 +882,17 @@ err_free: return NULL; } -static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, + unsigned short order) { struct kho_mem_track *track = &kho_out.ser.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); __kho_unpreserve(track, pfn, pfn + 1); - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); - __kho_unpreserve(track, pfn, pfn + 1); + __kho_unpreserve(track, pfn, pfn + (1 << order)); } } @@ -882,7 +903,7 @@ static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) while (chunk) { struct kho_vmalloc_chunk *tmp = chunk; - kho_vmalloc_unpreserve_chunk(chunk); + kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); free_page((unsigned long)tmp); @@ -992,7 +1013,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) while (chunk) { struct page *page; - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { phys_addr_t phys = chunk->phys[i]; if (idx + contig_pages > total_pages) diff --git a/kernel/kexec_handover_debug.c b/kernel/kexec_handover_debug.c new file mode 100644 index 000000000000..6efb696f5426 --- /dev/null +++ b/kernel/kexec_handover_debug.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debug.c - kexec handover optional debug functionality + * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include "kexec_handover_internal.h" + +bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + phys_addr_t scratch_start, scratch_end; + unsigned int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + scratch_start = kho_scratch[i].addr; + scratch_end = kho_scratch[i].addr + kho_scratch[i].size; + + if (phys < scratch_end && (phys + size) > scratch_start) + return true; + } + + return false; +} diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h new file mode 100644 index 000000000000..3c3c7148ceed --- /dev/null +++ b/kernel/kexec_handover_internal.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H +#define LINUX_KEXEC_HANDOVER_INTERNAL_H + +#include <linux/kexec_handover.h> +#include <linux/types.h> + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUG +bool kho_scratch_overlap(phys_addr_t phys, size_t size); +#else +static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ + +#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 31b072e8d427..99a3808d086f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { - unsigned long flags; - if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, mask); + /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) @@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; - unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { @@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); - /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, affinity); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, affinity); mutex_unlock(&kthreads_hotplug_lock); out: diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 53d51ed619a3..4c0a9c18d0b2 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -18,3 +18,15 @@ config LIVEPATCH module uses the interface provided by this option to register a patch, causing calls to patched functions to be redirected to new function code contained in the patch module. + +config HAVE_KLP_BUILD + bool + help + Arch supports klp-build + +config KLP_BUILD + def_bool y + depends on LIVEPATCH && HAVE_KLP_BUILD + select OBJTOOL + help + Enable klp-build support diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0e73fac55f8e..0044a8125013 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -217,14 +217,14 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { - pr_err("symbol %s is not marked as a livepatch symbol\n", - strtab + sym->st_name); + pr_err("symbol %s at rela sec %u idx %d is not marked as a livepatch symbol\n", + strtab + sym->st_name, symndx, i); return -EINVAL; } /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, - ".klp.sym.%55[^.].%511[^,],%lu", + KLP_SYM_PREFIX "%55[^.].%511[^,],%lu", sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", @@ -303,7 +303,7 @@ static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, * See comment in klp_resolve_symbols() for an explanation * of the selected field width value. */ - cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]", + cnt = sscanf(shstrtab + sec->sh_name, KLP_RELOC_SEC_PREFIX "%55[^.]", sec_objname); if (cnt != 1) { pr_err("section %s has an incorrectly formatted name\n", diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 949103fd8e9b..2c6b02d4699b 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -78,16 +78,8 @@ void debug_mutex_unlock(struct mutex *lock) } } -void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key) +void debug_mutex_init(struct mutex *lock) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); -#endif lock->magic = lock; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index de7d6702cd96..2a1d165b3167 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -43,8 +43,7 @@ # define MUTEX_WARN_ON(cond) #endif -void -__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) +static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); raw_spin_lock_init(&lock->wait_lock); @@ -52,10 +51,8 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif - - debug_mutex_init(lock, name, key); + debug_mutex_init(lock); } -EXPORT_SYMBOL(__mutex_init); static inline struct task_struct *__owner_task(unsigned long owner) { @@ -142,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock) * There is nothing that would stop spreading the lockdep annotations outwards * except more code. */ +void mutex_init_generic(struct mutex *lock) +{ + __mutex_init_generic(lock); +} +EXPORT_SYMBOL(mutex_init_generic); /* * Optimistic trylock that only works in the uncontended case. Make sure to @@ -166,7 +168,21 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL); } -#endif + +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key) +{ + __mutex_init_generic(lock); + + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_init_lockep); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) { diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 2e8080a9bee3..9ad4da8cea00 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -59,8 +59,7 @@ extern void debug_mutex_add_waiter(struct mutex *lock, extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); +extern void debug_mutex_init(struct mutex *lock); #else /* CONFIG_DEBUG_MUTEXES */ # define debug_mutex_lock_common(lock, waiter) do { } while (0) # define debug_mutex_wake_waiter(lock, waiter) do { } while (0) @@ -68,6 +67,6 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, # define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_unlock(lock) do { } while (0) -# define debug_mutex_init(lock, name, key) do { } while (0) +# define debug_mutex_init(lock) do { } while (0) #endif /* !CONFIG_DEBUG_MUTEXES */ #endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index bafd5af98eae..59dbd29cb219 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -515,13 +515,11 @@ void rt_mutex_debug_task_free(struct task_struct *task) #ifdef CONFIG_PREEMPT_RT /* Mutexes */ -void __mutex_rt_init(struct mutex *mutex, const char *name, - struct lock_class_key *key) +static void __mutex_rt_init_generic(struct mutex *mutex) { + rt_mutex_base_init(&mutex->rtmutex); debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); - lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); } -EXPORT_SYMBOL(__mutex_rt_init); static __always_inline int __mutex_lock_common(struct mutex *lock, unsigned int state, @@ -542,6 +540,13 @@ static __always_inline int __mutex_lock_common(struct mutex *lock, } #ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key) +{ + __mutex_rt_init_generic(mutex); + lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_rt_init_lockdep); + void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); @@ -598,6 +603,12 @@ int __sched _mutex_trylock_nest_lock(struct mutex *lock, EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock); #else /* CONFIG_DEBUG_LOCK_ALLOC */ +void mutex_rt_init_generic(struct mutex *mutex) +{ + __mutex_rt_init_generic(mutex); +} +EXPORT_SYMBOL(mutex_rt_init_generic); + void __sched mutex_lock(struct mutex *lock) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 87b03d2e41db..2338b3adfb55 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -184,8 +184,8 @@ void do_raw_read_unlock(rwlock_t *lock) static inline void debug_write_lock_before(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c1fb2bad6d72..bdc3c86231d3 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/ns_common.h> +#include <linux/nstree.h> #include <linux/proc_ns.h> +#include <linux/user_namespace.h> #include <linux/vfsdebug.h> #ifdef CONFIG_DEBUG_VFS @@ -52,26 +55,257 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { + int ret = 0; + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; ns->ns_type = ns_type; - RB_CLEAR_NODE(&ns->ns_tree_node); - INIT_LIST_HEAD(&ns->ns_list_node); + ns_tree_node_init(&ns->ns_tree_node); + ns_tree_node_init(&ns->ns_unified_node); + ns_tree_node_init(&ns->ns_owner_node); + ns_tree_root_init(&ns->ns_owner_root); #ifdef CONFIG_DEBUG_VFS ns_debug(ns, ops); #endif - if (inum) { + if (inum) ns->inum = inum; - return 0; - } - return proc_alloc_inum(&ns->inum); + else + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + /* + * Tree ref starts at 0. It's incremented when namespace enters + * active use (installed in nsproxy) and decremented when all + * active uses are gone. Initial namespaces are always active. + */ + if (is_ns_init_inum(ns)) + atomic_set(&ns->__ns_ref_active, 1); + else + atomic_set(&ns->__ns_ref_active, 0); + return 0; } void __ns_common_free(struct ns_common *ns) { proc_free_inum(ns->inum); } + +struct ns_common *__must_check ns_owner(struct ns_common *ns) +{ + struct user_namespace *owner; + + if (unlikely(!ns->ops)) + return NULL; + VFS_WARN_ON_ONCE(!ns->ops->owner); + owner = ns->ops->owner(ns); + VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); + if (!owner) + return NULL; + /* Skip init_user_ns as it's always active */ + if (owner == &init_user_ns) + return NULL; + return to_ns_common(owner); +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * The iteration stops once we reach a namespace that still has active + * references. + */ +void __ns_ref_active_put(struct ns_common *ns) +{ + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + } +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. This makes it possible to efficiently + * resurrect a namespace tree: + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Assume the whole tree is dead but all namespaces are still active: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - - - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): + * + * net_ns pid_ns + * \ / + * + - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - + - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If net_ns had a zero reference count and we bumped it we also need to + * take another reference on its owning user namespace. Similarly, if + * pid_ns had a zero reference count it also needs to take another + * reference on its owning user namespace. So both net_ns and pid_ns + * will each have their own reference on the owning user namespace. + * + * If the owning user namespace user_ns1 had a zero reference count then + * it also needs to take another reference on its owning user namespace + * and so on. + */ +void __ns_ref_active_get(struct ns_common *ns) +{ + int prev; + + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + /* If we didn't resurrect the namespace we're done. */ + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + + /* + * We did resurrect it. Walk the ownership hierarchy upwards + * until we found an owning user namespace that is active. + */ + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + } +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 19aa64ab08c8..259c4b4f1eeb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,6 +26,7 @@ #include <linux/syscalls.h> #include <linux/cgroup.h> #include <linux/perf_event.h> +#include <linux/nstree.h> static struct kmem_cache *nsproxy_cachep; @@ -59,6 +60,25 @@ static inline struct nsproxy *create_nsproxy(void) return nsproxy; } +static inline void nsproxy_free(struct nsproxy *ns) +{ + put_mnt_ns(ns->mnt_ns); + put_uts_ns(ns->uts_ns); + put_ipc_ns(ns->ipc_ns); + put_pid_ns(ns->pid_ns_for_children); + put_time_ns(ns->time_ns); + put_time_ns(ns->time_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); + put_net(ns->net_ns); + kmem_cache_free(nsproxy_cachep, ns); +} + +void deactivate_nsproxy(struct nsproxy *ns) +{ + nsproxy_ns_active_put(ns); + nsproxy_free(ns); +} + /* * Create new nsproxy and all of its the associated namespaces. * Return the newly created nsproxy. Do not attach this to the task, @@ -179,23 +199,11 @@ int copy_namespaces(u64 flags, struct task_struct *tsk) if ((flags & CLONE_VM) == 0) timens_on_fork(new_ns, tsk); + nsproxy_ns_active_get(new_ns); tsk->nsproxy = new_ns; return 0; } -void free_nsproxy(struct nsproxy *ns) -{ - put_mnt_ns(ns->mnt_ns); - put_uts_ns(ns->uts_ns); - put_ipc_ns(ns->ipc_ns); - put_pid_ns(ns->pid_ns_for_children); - put_time_ns(ns->time_ns); - put_time_ns(ns->time_ns_for_children); - put_cgroup_ns(ns->cgroup_ns); - put_net(ns->net_ns); - kmem_cache_free(nsproxy_cachep, ns); -} - /* * Called from unshare. Unshare all the namespaces part of nsproxy. * On success, returns the new nsproxy. @@ -232,6 +240,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + if (new) + nsproxy_ns_active_get(new); + task_lock(p); ns = p->nsproxy; p->nsproxy = new; @@ -241,11 +252,27 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) put_nsproxy(ns); } -void exit_task_namespaces(struct task_struct *p) +void exit_nsproxy_namespaces(struct task_struct *p) { switch_task_namespaces(p, NULL); } +void switch_cred_namespaces(const struct cred *old, const struct cred *new) +{ + ns_ref_active_get(new->user_ns); + ns_ref_active_put(old->user_ns); +} + +void get_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_get(tsk->real_cred->user_ns); +} + +void exit_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_put(tsk->real_cred->user_ns); +} + int exec_task_namespaces(void) { struct task_struct *tsk = current; @@ -315,7 +342,7 @@ static void put_nsset(struct nsset *nsset) if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) free_fs_struct(nsset->fs); if (nsset->nsproxy) - free_nsproxy(nsset->nsproxy); + nsproxy_free(nsset->nsproxy); } static int prepare_nsset(unsigned flags, struct nsset *nsset) diff --git a/kernel/nstree.c b/kernel/nstree.c index b24a320a11a6..f36c59e6951d 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -1,140 +1,261 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/nstree.h> #include <linux/proc_ns.h> +#include <linux/rculist.h> #include <linux/vfsdebug.h> +#include <linux/syscalls.h> +#include <linux/user_namespace.h> -/** - * struct ns_tree - Namespace tree - * @ns_tree: Rbtree of namespaces of a particular type - * @ns_list: Sequentially walkable list of all namespaces of this type - * @ns_tree_lock: Seqlock to protect the tree and list - * @type: type of namespaces in this tree - */ -struct ns_tree { - struct rb_root ns_tree; - struct list_head ns_list; - seqlock_t ns_tree_lock; - int type; +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); + +DEFINE_LOCK_GUARD_0(ns_tree_writer, + write_seqlock(&ns_tree_lock), + write_sequnlock(&ns_tree_lock)) + +DEFINE_LOCK_GUARD_0(ns_tree_locked_reader, + read_seqlock_excl(&ns_tree_lock), + read_sequnlock_excl(&ns_tree_lock)) + +static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */ + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head), }; -struct ns_tree mnt_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), - .type = CLONE_NEWNS, +struct ns_tree_root mnt_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head), }; -struct ns_tree net_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), - .type = CLONE_NEWNET, +struct ns_tree_root net_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head), }; EXPORT_SYMBOL_GPL(net_ns_tree); -struct ns_tree uts_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), - .type = CLONE_NEWUTS, +struct ns_tree_root uts_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head), }; -struct ns_tree user_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), - .type = CLONE_NEWUSER, +struct ns_tree_root user_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head), }; -struct ns_tree ipc_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), - .type = CLONE_NEWIPC, +struct ns_tree_root ipc_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head), }; -struct ns_tree pid_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), - .type = CLONE_NEWPID, +struct ns_tree_root pid_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head), }; -struct ns_tree cgroup_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), - .type = CLONE_NEWCGROUP, +struct ns_tree_root cgroup_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head), }; -struct ns_tree time_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), - .type = CLONE_NEWTIME, +struct ns_tree_root time_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head), }; -DEFINE_COOKIE(namespace_cookie); +/** + * ns_tree_node_init - Initialize a namespace tree node + * @node: The node to initialize + * + * Initializes both the rbtree node and list entry. + */ +void ns_tree_node_init(struct ns_tree_node *node) +{ + RB_CLEAR_NODE(&node->ns_node); + INIT_LIST_HEAD(&node->ns_list_entry); +} + +/** + * ns_tree_root_init - Initialize a namespace tree root + * @root: The root to initialize + * + * Initializes both the rbtree root and list head. + */ +void ns_tree_root_init(struct ns_tree_root *root) +{ + root->ns_rb = RB_ROOT; + INIT_LIST_HEAD(&root->ns_list_head); +} + +/** + * ns_tree_node_empty - Check if a namespace tree node is empty + * @node: The node to check + * + * Returns true if the node is not in any tree. + */ +bool ns_tree_node_empty(const struct ns_tree_node *node) +{ + return RB_EMPTY_NODE(&node->ns_node); +} + +/** + * ns_tree_node_add - Add a node to a namespace tree + * @node: The node to add + * @root: The tree root to add to + * @cmp: Comparison function for rbtree insertion + * + * Adds the node to both the rbtree and the list, maintaining sorted order. + * The list is maintained in the same order as the rbtree to enable efficient + * iteration. + * + * Returns: NULL if insertion succeeded, existing node if duplicate found + */ +struct rb_node *ns_tree_node_add(struct ns_tree_node *node, + struct ns_tree_root *root, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node *ret, *prev; + + /* Add to rbtree */ + ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp); + + /* Add to list in sorted order */ + prev = rb_prev(&node->ns_node); + if (!prev) { + /* No previous node, add at head */ + list_add_rcu(&node->ns_list_entry, &root->ns_list_head); + } else { + /* Add after previous node */ + struct ns_tree_node *prev_node; + prev_node = rb_entry(prev, struct ns_tree_node, ns_node); + list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry); + } + + return ret; +} + +/** + * ns_tree_node_del - Remove a node from a namespace tree + * @node: The node to remove + * @root: The tree root to remove from + * + * Removes the node from both the rbtree and the list atomically. + */ +void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root) +{ + rb_erase(&node->ns_node, &root->ns_rb); + RB_CLEAR_NODE(&node->ns_node); + list_bidir_del_rcu(&node->ns_list_entry); +} static inline struct ns_common *node_to_ns(const struct rb_node *node) { if (!node) return NULL; - return rb_entry(node, struct ns_common, ns_tree_node); + return rb_entry(node, struct ns_common, ns_tree_node.ns_node); } -static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +static inline struct ns_common *node_to_ns_unified(const struct rb_node *node) { - struct ns_common *ns_a = node_to_ns(a); - struct ns_common *ns_b = node_to_ns(b); - u64 ns_id_a = ns_a->ns_id; - u64 ns_id_b = ns_b->ns_id; + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_unified_node.ns_node); +} - if (ns_id_a < ns_id_b) +static inline struct ns_common *node_to_ns_owner(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_owner_node.ns_node); +} + +static int ns_id_cmp(u64 id_a, u64 id_b) +{ + if (id_a < id_b) return -1; - if (ns_id_a > ns_id_b) + if (id_a > id_b) return 1; return 0; } -void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +static int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id); +} + +static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id); +} + +static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b) { - struct rb_node *node, *prev; + return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id); +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree) +{ + struct rb_node *node; + const struct proc_ns_operations *ops = ns->ops; VFS_WARN_ON_ONCE(!ns->ns_id); - write_seqlock(&ns_tree->ns_tree_lock); + guard(ns_tree_writer)(); - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + /* Add to per-type tree and list */ + node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp); - node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); - /* - * If there's no previous entry simply add it after the - * head and if there is add it after the previous entry. - */ - prev = rb_prev(&ns->ns_tree_node); - if (!prev) - list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); - else - list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + /* Add to unified tree and list */ + ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified); + + /* Add to owner's tree if applicable */ + if (ops) { + struct user_namespace *user_ns; - write_sequnlock(&ns_tree->ns_tree_lock); + VFS_WARN_ON_ONCE(!ops->owner); + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + /* Insert into owner's tree and list */ + ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner); + } else { + /* Only the initial user namespace doesn't have an owner. */ + VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns)); + } + } VFS_WARN_ON_ONCE(node); } -void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree) { - VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); - VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + const struct proc_ns_operations *ops = ns->ops; + struct user_namespace *user_ns; + + VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry)); + + write_seqlock(&ns_tree_lock); + + /* Remove from per-type tree and list */ + ns_tree_node_del(&ns->ns_tree_node, ns_tree); + + /* Remove from unified tree and list */ + ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root); - write_seqlock(&ns_tree->ns_tree_lock); - rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); - list_bidir_del_rcu(&ns->ns_list_node); - RB_CLEAR_NODE(&ns->ns_tree_node); - write_sequnlock(&ns_tree->ns_tree_lock); + /* Remove from owner's tree if applicable */ + if (ops) { + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root); + } + } + + write_sequnlock(&ns_tree_lock); } EXPORT_SYMBOL_GPL(__ns_tree_remove); @@ -150,8 +271,19 @@ static int ns_find(const void *key, const struct rb_node *node) return 0; } +static int ns_find_unified(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns_unified(node); -static struct ns_tree *ns_tree_from_type(int ns_type) + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + +static struct ns_tree_root *ns_tree_from_type(int ns_type) { switch (ns_type) { case CLONE_NEWCGROUP: @@ -175,73 +307,507 @@ static struct ns_tree *ns_tree_from_type(int ns_type) return NULL; } -struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id) { - struct ns_tree *ns_tree; struct rb_node *node; unsigned int seq; - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + do { + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified); + if (node) + break; + } while (read_seqretry(&ns_tree_lock, seq)); + + return node_to_ns_unified(node); +} + +static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree_root *ns_tree; + struct rb_node *node; + unsigned int seq; ns_tree = ns_tree_from_type(ns_type); if (!ns_tree) return NULL; do { - seq = read_seqbegin(&ns_tree->ns_tree_lock); - node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find); if (node) break; - } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + } while (read_seqretry(&ns_tree_lock, seq)); - if (!node) - return NULL; + return node_to_ns(node); +} - VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); - return node_to_ns(node); + if (ns_type) + return __ns_tree_lookup_rcu(ns_id, ns_type); + + return __ns_unified_tree_lookup_rcu(ns_id); } /** - * ns_tree_adjoined_rcu - find the next/previous namespace in the same + * __ns_tree_adjoined_rcu - find the next/previous namespace in the same * tree * @ns: namespace to start from + * @ns_tree: namespace tree to search in * @previous: if true find the previous namespace, otherwise the next * * Find the next or previous namespace in the same tree as @ns. If * there is no next/previous namespace, -ENOENT is returned. */ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, - struct ns_tree *ns_tree, bool previous) + struct ns_tree_root *ns_tree, bool previous) { struct list_head *list; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); if (previous) - list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry)); else - list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); - if (list_is_head(list, &ns_tree->ns_list)) + list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry)); + if (list_is_head(list, &ns_tree->ns_list_head)) return ERR_PTR(-ENOENT); - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); - - return list_entry_rcu(list, struct ns_common, ns_list_node); + return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry); } /** - * ns_tree_gen_id - generate a new namespace id + * __ns_tree_gen_id - generate a new namespace id * @ns: namespace to generate id for + * @id: if non-zero, this is the initial namespace and this is a fixed id * * Generates a new namespace id and assigns it to the namespace. All * namespaces types share the same id space and thus can be compared * directly. IOW, when two ids of two namespace are equal, they are * identical. */ -u64 ns_tree_gen_id(struct ns_common *ns) +u64 __ns_tree_gen_id(struct ns_common *ns, u64 id) { - guard(preempt)(); - ns->ns_id = gen_cookie_next(&namespace_cookie); + static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1); + + if (id) + ns->ns_id = id; + else + ns->ns_id = atomic64_inc_return(&namespace_cookie); return ns->ns_id; } + +struct klistns { + u64 __user *uns_ids; + u32 nr_ns_ids; + u64 last_ns_id; + u64 user_ns_id; + u32 ns_type; + struct user_namespace *user_ns; + bool userns_capable; + struct ns_common *first_ns; +}; + +static void __free_klistns_free(const struct klistns *kls) +{ + if (kls->user_ns_id != LISTNS_CURRENT_USER) + put_user_ns(kls->user_ns); + if (kls->first_ns && kls->first_ns->ops) + kls->first_ns->ops->put(kls->first_ns); +} + +#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS) + +static int copy_ns_id_req(const struct ns_id_req __user *req, + struct ns_id_req *kreq) +{ + int ret; + size_t usize; + + BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0); + + ret = get_user(usize, &req->size); + if (ret) + return -EFAULT; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < NS_ID_REQ_SIZE_VER0)) + return -EINVAL; + memset(kreq, 0, sizeof(*kreq)); + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); + if (ret) + return ret; + if (kreq->spare != 0) + return -EINVAL; + if (kreq->ns_type & ~NS_ALL) + return -EOPNOTSUPP; + return 0; +} + +static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq, + u64 __user *ns_ids, size_t nr_ns_ids) +{ + kls->last_ns_id = kreq->ns_id; + kls->user_ns_id = kreq->user_ns_id; + kls->nr_ns_ids = nr_ns_ids; + kls->ns_type = kreq->ns_type; + kls->uns_ids = ns_ids; + return 0; +} + +/* + * Lookup a namespace owned by owner with id >= ns_id. + * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner) +{ + struct ns_common *ret = NULL; + struct rb_node *node; + + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + guard(ns_tree_locked_reader)(); + + node = owner->ns_owner_root.ns_rb.rb_node; + while (node) { + struct ns_common *ns; + + ns = node_to_ns_owner(node); + if (ns_id <= ns->ns_id) { + ret = ns; + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + return ret; +} + +static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type) +{ + struct ns_common *ns; + + guard(rcu)(); + ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type); + if (!ns) + return NULL; + + if (!ns_get_unless_inactive(ns)) + return NULL; + + return ns; +} + +static inline bool __must_check ns_requested(const struct klistns *kls, + const struct ns_common *ns) +{ + return !kls->ns_type || (kls->ns_type & ns->ns_type); +} + +static inline bool __must_check may_list_ns(const struct klistns *kls, + struct ns_common *ns) +{ + if (kls->user_ns) { + if (kls->userns_capable) + return true; + } else { + struct ns_common *owner; + struct user_namespace *user_ns; + + owner = ns_owner(ns); + if (owner) + user_ns = to_user_ns(owner); + else + user_ns = &init_user_ns; + if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) + return true; + } + + if (is_current_namespace(ns)) + return true; + + if (ns->ns_type != CLONE_NEWUSER) + return false; + + if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) + return true; + + return false; +} + +static inline void ns_put(struct ns_common *ns) +{ + if (ns && ns->ops) + ns->ops->put(ns); +} + +DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) + +static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, + struct ns_common *candidate) +{ + struct ns_common *ns __free(ns_put) = NULL; + + if (!ns_requested(kls, candidate)) + return NULL; + + ns = ns_get_unless_inactive(candidate); + if (!ns) + return NULL; + + if (!may_list_ns(kls, ns)) + return NULL; + + return no_free_ptr(ns); +} + +static ssize_t do_listns_userns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; + const struct list_head *head; + ssize_t ret; + + VFS_WARN_ON_ONCE(!kls->user_ns_id); + + if (kls->user_ns_id == LISTNS_CURRENT_USER) + ns = to_ns_common(current_user_ns()); + else if (kls->user_ns_id) + ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER); + if (!ns) + return -EINVAL; + kls->user_ns = to_user_ns(ns); + + /* + * Use the rbtree to find the first namespace we care about and + * then use it's list entry to iterate from there. + */ + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head; + kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); + + rcu_read_lock(); + + if (!first_ns) + first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry); + + ns = first_ns; + list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) { + struct ns_common *valid; + + if (!nr_ns_ids) + break; + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + ns_put(prev); + return ret; +} + +/* + * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree. + * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) +{ + struct ns_common *ret = NULL; + struct ns_tree_root *ns_tree = NULL; + struct rb_node *node; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + } + + guard(ns_tree_locked_reader)(); + + if (ns_tree) + node = ns_tree->ns_rb.rb_node; + else + node = ns_unified_root.ns_rb.rb_node; + + while (node) { + struct ns_common *ns; + + if (ns_type) + ns = node_to_ns(node); + else + ns = node_to_ns_unified(node); + + if (ns_id <= ns->ns_id) { + if (ns_type) + ret = node_to_ns(node); + else + ret = node_to_ns_unified(node); + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + return ret; +} + +static inline struct ns_common *first_ns_common(const struct list_head *head, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry); +} + +static inline struct ns_common *next_ns_common(struct ns_common *ns, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry); +} + +static inline bool ns_common_is_head(struct ns_common *ns, + const struct list_head *head, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return &ns->ns_tree_node.ns_list_entry == head; + return &ns->ns_unified_node.ns_list_entry == head; +} + +static ssize_t do_listns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns, *first_ns = NULL, *prev = NULL; + struct ns_tree_root *ns_tree = NULL; + const struct list_head *head; + u32 ns_type; + ssize_t ret; + + if (hweight32(kls->ns_type) == 1) + ns_type = kls->ns_type; + else + ns_type = 0; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return -EINVAL; + } + + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + if (ns_tree) + head = &ns_tree->ns_list_head; + else + head = &ns_unified_root.ns_list_head; + + rcu_read_lock(); + + if (!first_ns) + first_ns = first_ns_common(head, ns_tree); + + for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; + ns = next_ns_common(ns, ns_tree)) { + struct ns_common *valid; + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + ns_put(prev); + return ret; +} + +SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req, + u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags) +{ + struct klistns klns __free(klistns_free) = {}; + const size_t maxcount = 1000000; + struct ns_id_req kreq; + ssize_t ret; + + if (flags) + return -EINVAL; + + if (unlikely(nr_ns_ids > maxcount)) + return -EOVERFLOW; + + if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids))) + return -EFAULT; + + ret = copy_ns_id_req(req, &kreq); + if (ret) + return ret; + + ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids); + if (ret) + return ret; + + if (kreq.user_ns_id) + return do_listns_userns(&klns); + + return do_listns(&klns); +} diff --git a/kernel/panic.c b/kernel/panic.c index 24cc3eec1805..b2f2470af7e5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -873,13 +873,15 @@ void __warn(const char *file, int line, void *caller, unsigned taint, disable_trace_on_warning(); - if (file) - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", - raw_smp_processor_id(), current->pid, file, line, - caller); - else - pr_warn("WARNING: CPU: %d PID: %d at %pS\n", - raw_smp_processor_id(), current->pid, caller); + if (file) { + pr_warn("WARNING: %s:%d at %pS, CPU#%d: %s/%d\n", + file, line, caller, + raw_smp_processor_id(), current->comm, current->pid); + } else { + pr_warn("WARNING: at %pS, CPU#%d: %s/%d\n", + caller, + raw_smp_processor_id(), current->comm, current->pid); + } #pragma GCC diagnostic push #ifndef __clang__ diff --git a/kernel/pid.c b/kernel/pid.c index 4fffec767a63..a31771bc89c1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,21 +71,16 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.__ns_ref = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_pid_ns), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .ns.inum = ns_init_inum(&init_pid_ns), -#ifdef CONFIG_PID_NS - .ns.ops = &pidns_operations, -#endif .pid_max = PID_MAX_DEFAULT, #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif - .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -117,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { int i; + struct pid_namespace *active_ns; lockdep_assert_not_held(&tasklist_lock); + active_ns = pid->numbers[pid->level].ns; + ns_ref_active_put(active_ns); + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; @@ -283,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } spin_unlock(&pidmap_lock); idr_preload_end(); + ns_ref_active_get(ns); return pid; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 650be58d8d18..e48f5de41361 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && ns_ref_put(ns)) + if (ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 53166ef86ba4..26e45f86b955 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -821,8 +821,7 @@ int hibernate(void) goto Restore; ksys_sync_helper(); - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -928,8 +927,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -1079,8 +1077,7 @@ static int software_resume(void) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b4ca17c2fecf..3d4ebedad69f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -375,8 +375,7 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 0beff7eeaaba..70ae21f7370d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -635,7 +635,7 @@ struct cmp_data { }; /* Indicates the image size after compression */ -static atomic_t compressed_size = ATOMIC_INIT(0); +static atomic64_t compressed_size = ATOMIC_INIT(0); /* * Compression function that runs in its own thread. @@ -664,7 +664,7 @@ static int compress_threadfn(void *data) d->ret = crypto_acomp_compress(d->cr); d->cmp_len = d->cr->dlen; - atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); + atomic64_add(d->cmp_len, &compressed_size); atomic_set_release(&d->stop, 1); wake_up(&d->done); } @@ -689,14 +689,14 @@ static int save_compressed_image(struct swap_map_handle *handle, ktime_t start; ktime_t stop; size_t off; - unsigned thr, run_threads, nr_threads; + unsigned int thr, run_threads, nr_threads; unsigned char *page = NULL; struct cmp_data *data = NULL; struct crc_data *crc = NULL; hib_init_batch(&hb); - atomic_set(&compressed_size, 0); + atomic64_set(&compressed_size, 0); /* * We'll limit the number of threads for compression to limit memory @@ -877,11 +877,14 @@ out_finish: stop = ktime_get(); if (!ret) ret = err2; - if (!ret) + if (!ret) { + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + pr_info("Image size after compression: %lld kbytes\n", + (atomic64_read(&compressed_size) / 1024)); pr_info("Image saving done\n"); - swsusp_show_speed(start, stop, nr_to_write, "Wrote"); - pr_info("Image size after compression: %d kbytes\n", - (atomic_read(&compressed_size) / 1024)); + } else { + pr_err("Image saving failed: %d\n", ret); + } out_clean: hib_finish_batch(&hb); @@ -899,7 +902,8 @@ out_clean: } vfree(data); } - if (page) free_page((unsigned long)page); + if (page) + free_page((unsigned long)page); return ret; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 75a84efad40f..392ec2f75f01 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, unsigned long size, void __user *data) { struct ptrace_rseq_configuration conf = { - .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, - .rseq_abi_size = task->rseq_len, - .signature = task->rseq_sig, + .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr, + .rseq_abi_size = task->rseq.len, + .signature = task->rseq.sig, .flags = 0, }; diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c1ebfd51768b..585cade21010 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -70,12 +70,10 @@ void rcu_qs(void) */ void rcu_sched_clock_irq(int user) { - if (user) { + if (user) rcu_qs(); - } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) + set_need_resched_current(); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8293bae1dec1..85b82a7007b9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2696,10 +2696,8 @@ void rcu_sched_clock_irq(int user) /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { /* Idle and userspace execution already are quiescent states. */ - if (!rcu_is_cpu_rrupt_from_idle() && !user) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + if (!rcu_is_cpu_rrupt_from_idle() && !user) + set_need_resched_current(); __this_cpu_write(rcu_data.rcu_urgent_qs, false); } rcu_flavor_sched_clock_irq(user); @@ -2824,7 +2822,6 @@ static void strict_work_handler(struct work_struct *work) /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { - unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -2837,8 +2834,8 @@ static __latent_entropy void rcu_core(void) if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { - set_tsk_need_resched(current); - set_preempt_need_resched(); + guard(irqsave)(); + set_need_resched_current(); } /* Update RCU state based on any recent quiescent states. */ @@ -2847,10 +2844,9 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { - local_irq_save(flags); + guard(irqsave)(); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - local_irq_restore(flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6058a734090c..96c49c56fc14 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -729,8 +729,7 @@ static void rcu_exp_need_qs(void) __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); /* Store .exp before .rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d85763336b3c..dbe2d02be824 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -753,8 +753,7 @@ static void rcu_read_unlock_special(struct task_struct *t) // Also if no expediting and no possible deboosting, // slow is OK. Plus nohz_full CPUs eventually get // tick enabled. - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && cpu_online(rdp->cpu)) { @@ -813,10 +812,8 @@ static void rcu_flavor_sched_clock_irq(int user) if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ - if (rcu_preempt_need_deferred_qs(t)) { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + if (rcu_preempt_need_deferred_qs(t)) + set_need_resched_current(); } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index d16afeb11506..b67532cb8770 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -763,8 +763,7 @@ static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) * progress and it could be we're stuck in kernel space without context * switches for an entirely unreasonable amount of time. */ - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } static bool csd_lock_suppress_rcu_stall; diff --git a/kernel/rseq.c b/kernel/rseq.c index 2452b7366b00..395d8b002350 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -8,98 +8,7 @@ * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */ -#include <linux/sched.h> -#include <linux/uaccess.h> -#include <linux/syscalls.h> -#include <linux/rseq.h> -#include <linux/types.h> -#include <linux/ratelimit.h> -#include <asm/ptrace.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/rseq.h> - -/* The original rseq structure size (including padding) is 32 bytes. */ -#define ORIG_RSEQ_SIZE 32 - -#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) - -#ifdef CONFIG_DEBUG_RSEQ -static struct rseq *rseq_kernel_fields(struct task_struct *t) -{ - return (struct rseq *) t->rseq_fields; -} - -static int rseq_validate_ro_fields(struct task_struct *t) -{ - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - u32 cpu_id_start, cpu_id, node_id, mm_cid; - struct rseq __user *rseq = t->rseq; - - /* - * Validate fields which are required to be read-only by - * user-space. - */ - if (!user_read_access_begin(rseq, t->rseq_len)) - goto efault; - unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); - unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_get_user(node_id, &rseq->node_id, efault_end); - unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); - user_read_access_end(); - - if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || - cpu_id != rseq_kernel_fields(t)->cpu_id || - node_id != rseq_kernel_fields(t)->node_id || - mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { - - pr_warn("Detected rseq corruption for pid: %d, name: %s\n" - "\tcpu_id_start: %u ?= %u\n" - "\tcpu_id: %u ?= %u\n" - "\tnode_id: %u ?= %u\n" - "\tmm_cid: %u ?= %u\n", - t->pid, t->comm, - cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, - cpu_id, rseq_kernel_fields(t)->cpu_id, - node_id, rseq_kernel_fields(t)->node_id, - mm_cid, rseq_kernel_fields(t)->mm_cid); - } - - /* For now, only print a console warning on mismatch. */ - return 0; - -efault_end: - user_read_access_end(); -efault: - return -EFAULT; -} - -/* - * Update an rseq field and its in-kernel copy in lock-step to keep a coherent - * state. - */ -#define rseq_unsafe_put_user(t, value, field, error_label) \ - do { \ - unsafe_put_user(value, &t->rseq->field, error_label); \ - rseq_kernel_fields(t)->field = value; \ - } while (0) - -#else -static int rseq_validate_ro_fields(struct task_struct *t) -{ - return 0; -} - -#define rseq_unsafe_put_user(t, value, field, error_label) \ - unsafe_put_user(value, &t->rseq->field, error_label) -#endif - /* - * * Restartable sequences are a lightweight interface that allows * user-level code to be executed atomically relative to scheduler * preemption and signal delivery. Typically used for implementing @@ -158,356 +67,356 @@ static int rseq_validate_ro_fields(struct task_struct *t) * F1. <failure> */ -static int rseq_update_cpu_node_id(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq; - u32 cpu_id = raw_smp_processor_id(); - u32 node_id = cpu_to_node(cpu_id); - u32 mm_cid = task_mm_cid(t); +/* Required to select the proper per_cpu ops for rseq_stats_inc() */ +#define RSEQ_BUILD_SLOW_PATH - /* - * Validate read-only rseq fields. - */ - if (rseq_validate_ro_fields(t)) - goto efault; - WARN_ON_ONCE((int) mm_cid < 0); - if (!user_write_access_begin(rseq, t->rseq_len)) - goto efault; +#include <linux/debugfs.h> +#include <linux/ratelimit.h> +#include <linux/rseq_entry.h> +#include <linux/sched.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <asm/ptrace.h> - rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); +#define CREATE_TRACE_POINTS +#include <trace/events/rseq.h> - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally updated only if - * t->rseq_len != ORIG_RSEQ_SIZE. - */ - user_write_access_end(); - trace_rseq_update(t); - return 0; +DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); -efault_end: - user_write_access_end(); -efault: - return -EFAULT; +static inline void rseq_control_debug(bool on) +{ + if (on) + static_branch_enable(&rseq_debug_enabled); + else + static_branch_disable(&rseq_debug_enabled); } -static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) +static int __init rseq_setup_debug(char *str) { - struct rseq __user *rseq = t->rseq; - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, - mm_cid = 0; - - /* - * Validate read-only rseq fields. - */ - if (rseq_validate_ro_fields(t)) - goto efault; + bool on; - if (!user_write_access_begin(rseq, t->rseq_len)) - goto efault; - - /* - * Reset all fields to their initial state. - * - * All fields have an initial state of 0 except cpu_id which is set to - * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after - * unregistration can figure out that rseq needs to be registered - * again. - */ - rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); - rseq_unsafe_put_user(t, node_id, node_id, efault_end); - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); - - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally reset only if - * t->rseq_len != ORIG_RSEQ_SIZE. - */ - user_write_access_end(); - return 0; - -efault_end: - user_write_access_end(); -efault: - return -EFAULT; + if (kstrtobool(str, &on)) + return -EINVAL; + rseq_control_debug(on); + return 1; } +__setup("rseq_debug=", rseq_setup_debug); +#ifdef CONFIG_TRACEPOINTS /* - * Get the user-space pointer value stored in the 'rseq_cs' field. + * Out of line, so the actual update functions can be in a header to be + * inlined into the exit to user code. */ -static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +void __rseq_trace_update(struct task_struct *t) { - if (!rseq_cs) - return -EFAULT; - -#ifdef CONFIG_64BIT - if (get_user(*rseq_cs, &rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) - return -EFAULT; -#endif + trace_rseq_update(t); +} - return 0; +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); } +#endif /* CONFIG_TRACEPOINTS */ -/* - * If the rseq_cs field of 'struct rseq' contains a valid pointer to - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. - */ -static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +#ifdef CONFIG_DEBUG_FS +#ifdef CONFIG_RSEQ_STATS +DEFINE_PER_CPU(struct rseq_stats, rseq_stats); + +static int rseq_stats_show(struct seq_file *m, void *p) { - struct rseq_cs __user *urseq_cs; - u64 ptr; - u32 __user *usig; - u32 sig; - int ret; - - ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); - if (ret) - return ret; - - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ - if (!ptr) { - memset(rseq_cs, 0, sizeof(*rseq_cs)); - return 0; + struct rseq_stats stats = { }; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); + stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); } - /* Check that the pointer value fits in the user-space process space. */ - if (ptr >= TASK_SIZE) - return -EINVAL; - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) - return -EFAULT; - if (rseq_cs->start_ip >= TASK_SIZE || - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || - rseq_cs->abort_ip >= TASK_SIZE || - rseq_cs->version > 0) - return -EINVAL; - /* Check for overflow. */ - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) - return -EINVAL; - /* Ensure that abort_ip is not in the critical section. */ - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) - return -EINVAL; + seq_printf(m, "exit: %16lu\n", stats.exit); + seq_printf(m, "signal: %16lu\n", stats.signal); + seq_printf(m, "slowp: %16lu\n", stats.slowpath); + seq_printf(m, "fastp: %16lu\n", stats.fastpath); + seq_printf(m, "ids: %16lu\n", stats.ids); + seq_printf(m, "cs: %16lu\n", stats.cs); + seq_printf(m, "clear: %16lu\n", stats.clear); + seq_printf(m, "fixup: %16lu\n", stats.fixup); + return 0; +} - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); - ret = get_user(sig, usig); - if (ret) - return ret; +static int rseq_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_stats_show, inode->i_private); +} - if (current->rseq_sig != sig) { - printk_ratelimited(KERN_WARNING - "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", - sig, current->rseq_sig, current->pid, usig); - return -EINVAL; - } +static const struct file_operations stat_ops = { + .open = rseq_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_stats_init(struct dentry *root_dir) +{ + debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); return 0; } +#else +static inline void rseq_stats_init(struct dentry *root_dir) { } +#endif /* CONFIG_RSEQ_STATS */ -static bool rseq_warn_flags(const char *str, u32 flags) +static int rseq_debug_show(struct seq_file *m, void *p) { - u32 test_flags; - - if (!flags) - return false; - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); - return true; + bool on = static_branch_unlikely(&rseq_debug_enabled); + + seq_printf(m, "%d\n", on); + return 0; } -static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) { - u32 flags, event_mask; - int ret; + bool on; - if (rseq_warn_flags("rseq_cs", cs_flags)) + if (kstrtobool_from_user(ubuf, count, &on)) return -EINVAL; - /* Get thread flags. */ - ret = get_user(flags, &t->rseq->flags); - if (ret) - return ret; + rseq_control_debug(on); + return count; +} - if (rseq_warn_flags("rseq", flags)) - return -EINVAL; +static int rseq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_debug_show, inode->i_private); +} - /* - * Load and clear event mask atomically with respect to - * scheduler preemption and membarrier IPIs. - */ - scoped_guard(RSEQ_EVENT_GUARD) { - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - } +static const struct file_operations debug_ops = { + .open = rseq_debug_open, + .read = seq_read, + .write = rseq_debug_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_debugfs_init(void) +{ + struct dentry *root_dir = debugfs_create_dir("rseq", NULL); - return !!event_mask; + debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); + rseq_stats_init(root_dir); + return 0; } +__initcall(rseq_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ -static int clear_rseq_cs(struct rseq __user *rseq) +static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) { - /* - * The rseq_cs field is set to NULL on preemption or signal - * delivery on top of rseq assembly block, as well as on top - * of code outside of the rseq assembly block. This performs - * a lazy clear of the rseq_cs field. - * - * Set rseq_cs to NULL. - */ -#ifdef CONFIG_64BIT - return put_user(0UL, &rseq->rseq_cs); -#else - if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) - return -EFAULT; - return 0; -#endif + return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); } -/* - * Unsigned comparison will be true when ip >= start_ip, and when - * ip < start_ip + post_commit_offset. - */ -static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; + struct rseq __user *urseq = t->rseq.usrptr; + u64 csaddr; + + scoped_user_read_access(urseq, efault) + unsafe_get_user(csaddr, &urseq->rseq_cs, efault); + if (likely(!csaddr)) + return true; + return rseq_update_user_cs(t, regs, csaddr); +efault: + return false; } -static int rseq_ip_fixup(struct pt_regs *regs) +static void rseq_slowpath_update_usr(struct pt_regs *regs) { - unsigned long ip = instruction_pointer(regs); + /* + * Preserve rseq state and user_irq state. The generic entry code + * clears user_irq on the way out, the non-generic entry + * architectures are not having user_irq. + */ + const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; struct task_struct *t = current; - struct rseq_cs rseq_cs; - int ret; + struct rseq_ids ids; + u32 node_id; + bool event; + + if (unlikely(t->flags & PF_EXITING)) + return; - ret = rseq_get_rseq_cs(t, &rseq_cs); - if (ret) - return ret; + rseq_stat_inc(rseq_stats.slowpath); /* - * Handle potentially not being within a critical section. - * If not nested over a rseq critical section, restart is useless. - * Clear the rseq_cs pointer and return. + * Read and clear the event pending bit first. If the task + * was not preempted or migrated or a signal is on the way, + * there is no point in doing any of the heavy lifting here + * on production kernels. In that case TIF_NOTIFY_RESUME + * was raised by some other functionality. + * + * This is correct because the read/clear operation is + * guarded against scheduler preemption, which makes it CPU + * local atomic. If the task is preempted right after + * re-enabling preemption then TIF_NOTIFY_RESUME is set + * again and this function is invoked another time _before_ + * the task is able to return to user mode. + * + * On a debug kernel, invoke the fixup code unconditionally + * with the result handed in to allow the detection of + * inconsistencies. */ - if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t->rseq); - ret = rseq_need_restart(t, rseq_cs.flags); - if (ret <= 0) - return ret; - ret = clear_rseq_cs(t->rseq); - if (ret) - return ret; - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, - rseq_cs.abort_ip); - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); - return 0; + scoped_guard(irq) { + event = t->rseq.event.sched_switch; + t->rseq.event.all &= evt_mask.all; + ids.cpu_id = task_cpu(t); + ids.mm_cid = task_mm_cid(t); + } + + if (!event) + return; + + node_id = cpu_to_node(ids.cpu_id); + + if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + /* + * Clear the errors just in case this might survive magically, but + * leave the rest intact. + */ + t->rseq.event.error = 0; + force_sig(SIGSEGV); + } } -/* - * This resume handler must always be executed between any of: - * - preemption, - * - signal delivery, - * and return to user-space. - * - * This is how we can ensure that the entire rseq critical section - * will issue the commit instruction only if executed atomically with - * respect to other threads scheduled on the same CPU, and with respect - * to signal handlers. - */ -void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) +void __rseq_handle_slowpath(struct pt_regs *regs) { - struct task_struct *t = current; - int ret, sig; - - if (unlikely(t->flags & PF_EXITING)) + /* + * If invoked from hypervisors before entering the guest via + * resume_user_mode_work(), then @regs is a NULL pointer. + * + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises + * it before returning from the ioctl() to user space when + * rseq_event.sched_switch is set. + * + * So it's safe to ignore here instead of pointlessly updating it + * in the vcpu_run() loop. + */ + if (!regs) return; + rseq_slowpath_update_usr(regs); +} + +void __rseq_signal_deliver(int sig, struct pt_regs *regs) +{ + rseq_stat_inc(rseq_stats.signal); /* - * regs is NULL if and only if the caller is in a syscall path. Skip - * fixup and leave rseq_cs as is so that rseq_sycall() will detect and - * kill a misbehaving userspace on debug kernels. + * Don't update IDs, they are handled on exit to user if + * necessary. The important thing is to abort a critical section of + * the interrupted context as after this point the instruction + * pointer in @regs points to the signal handler. */ - if (regs) { - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + if (unlikely(!rseq_handle_cs(current, regs))) { + /* + * Clear the errors just in case this might survive + * magically, but leave the rest intact. + */ + current->rseq.event.error = 0; + force_sigsegv(sig); } - if (unlikely(rseq_update_cpu_node_id(t))) - goto error; - return; - -error: - sig = ksig ? ksig->sig : 0; - force_sigsegv(sig); } -#ifdef CONFIG_DEBUG_RSEQ - /* * Terminate the process if a syscall is issued within a restartable * sequence. */ -void rseq_syscall(struct pt_regs *regs) +void __rseq_debug_syscall_return(struct pt_regs *regs) { - unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; - struct rseq_cs rseq_cs; + u64 csaddr; - if (!t->rseq) + if (!t->rseq.event.has_rseq) return; - if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) - force_sig(SIGSEGV); + if (get_user(csaddr, &t->rseq.usrptr->rseq_cs)) + goto fail; + if (likely(!csaddr)) + return; + if (unlikely(csaddr >= TASK_SIZE)) + goto fail; + if (rseq_debug_update_user_cs(t, regs, csaddr)) + return; +fail: + force_sig(SIGSEGV); } +#ifdef CONFIG_DEBUG_RSEQ +/* Kept around to keep GENERIC_ENTRY=n architectures supported. */ +void rseq_syscall(struct pt_regs *regs) +{ + __rseq_debug_syscall_return(regs); +} #endif +static bool rseq_reset_ids(void) +{ + struct rseq_ids ids = { + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, + .mm_cid = 0, + }; + + /* + * If this fails, terminate it because this leaves the kernel in + * stupid state as exit to user space will try to fixup the ids + * again. + */ + if (rseq_set_ids(current, &ids, 0)) + return true; + + force_sig(SIGSEGV); + return false; +} + +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + /* * sys_rseq - setup restartable sequences for caller thread. */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, - int, flags, u32, sig) +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { - int ret; - u64 rseq_cs; - if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. */ - if (current->rseq != rseq || !current->rseq) + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) return -EINVAL; - if (rseq_len != current->rseq_len) + if (rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; - ret = rseq_reset_rseq_cpu_node_id(current); - if (ret) - return ret; - current->rseq = NULL; - current->rseq_sig = 0; - current->rseq_len = 0; + if (!rseq_reset_ids()) + return -EFAULT; + rseq_reset(current); return 0; } if (unlikely(flags)) return -EINVAL; - if (current->rseq) { + if (current->rseq.usrptr) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != current->rseq_len) + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) return -EINVAL; - if (current->rseq_sig != sig) + if (current->rseq.sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; @@ -531,43 +440,39 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (!access_ok(rseq, rseq_len)) return -EFAULT; - /* - * If the rseq_cs pointer is non-NULL on registration, clear it to - * avoid a potential segfault on return to user-space. The proper thing - * to do would have been to fail the registration but this would break - * older libcs that reuse the rseq area for new threads without - * clearing the fields. - */ - if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) - return -EFAULT; - if (rseq_cs && clear_rseq_cs(rseq)) - return -EFAULT; + scoped_user_write_access(rseq, efault) { + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. Don't bother reading it, just reset it. + */ + unsafe_put_user(0UL, &rseq->rseq_cs, efault); + /* Initialize IDs in user space */ + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0U, &rseq->node_id, efault); + unsafe_put_user(0U, &rseq->mm_cid, efault); + } -#ifdef CONFIG_DEBUG_RSEQ - /* - * Initialize the in-kernel rseq fields copy for validation of - * read-only fields. - */ - if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || - get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || - get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || - get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) - return -EFAULT; -#endif /* * Activate the registration by setting the rseq area address, length * and signature in the task struct. */ - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; + current->rseq.usrptr = rseq; + current->rseq.len = rseq_len; + current->rseq.sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. */ - rseq_set_notify_resume(current); - + current->rseq.event.has_rseq = true; + rseq_force_update(); return 0; + +efault: + return -EFAULT; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f1ebf67b48e2..fc358c1b6ca9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); #ifdef CONFIG_SCHED_PROXY_EXEC DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); @@ -583,8 +584,8 @@ EXPORT_SYMBOL(__trace_set_current_state); * * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: * - * is set by activate_task() and cleared by deactivate_task(), under - * rq->lock. Non-zero indicates the task is runnable, the special + * is set by activate_task() and cleared by deactivate_task()/block_task(), + * under rq->lock. Non-zero indicates the task is runnable, the special * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * @@ -2089,6 +2090,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p, flags); + rq->queue_mask |= p->sched_class->queue_mask; p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2121,6 +2123,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. */ uclamp_rq_dec(rq, p); + rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2128,8 +2131,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_on_rq_migrating(p)) flags |= ENQUEUE_MIGRATED; - if (flags & ENQUEUE_MIGRATED) - sched_mm_cid_migrate_to(rq, p); enqueue_task(rq, p, flags); @@ -2169,37 +2170,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -/* - * ->switching_to() is called with the pi_lock and rq_lock held and must not - * mess with locking. - */ -void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class) -{ - if (prev_class != p->sched_class && p->sched_class->switching_to) - p->sched_class->switching_to(rq, p); -} - -/* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. - * - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). - */ -void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) -{ - if (prev_class != p->sched_class) { - if (prev_class->switched_from) - prev_class->switched_from(rq, p); - - p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio || dl_task(p)) - p->sched_class->prio_changed(rq, p, oldprio); -} - void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; @@ -2362,7 +2332,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { @@ -2377,10 +2347,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) if (p->cpus_ptr != &p->cpus_mask) return; - /* - * Violates locking rules! See comment in __do_set_cpus_allowed(). - */ - __do_set_cpus_allowed(p, &ac); + scoped_guard (task_rq_lock, p) + do_set_cpus_allowed(p, &ac); } void ___migrate_enable(void) @@ -2613,7 +2581,8 @@ static int migration_cpu_stop(void *data) */ WARN_ON_ONCE(!pending->stop_pending); preempt_disable(); - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); preempt_enable(); @@ -2622,7 +2591,8 @@ static int migration_cpu_stop(void *data) out: if (pending) pending->stop_pending = false; - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); if (complete) complete_all(&pending->done); @@ -2671,6 +2641,8 @@ out_unlock: return 0; } +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask); + /* * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. @@ -2684,6 +2656,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx cpumask_copy(&p->cpus_mask, ctx->new_mask); p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); + mm_update_cpus_allowed(p->mm, ctx->new_mask); /* * Swap in a new user_cpus_ptr if SCA_USER flag set @@ -2693,56 +2666,17 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { - struct rq *rq = task_rq(p); - bool queued, running; - - /* - * This here violates the locking rules for affinity, since we're only - * supposed to change these variables while holding both rq->lock and - * p->pi_lock. - * - * HOWEVER, it magically works, because ttwu() is the only code that - * accesses these variables under p->pi_lock and only does so after - * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() - * before finish_task(). - * - * XXX do further audits, this smells like something putrid. - */ - if (ctx->flags & SCA_MIGRATE_DISABLE) - WARN_ON_ONCE(!p->on_cpu); - else - lockdep_assert_held(&p->pi_lock); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) { - /* - * Because __kthread_bind() calls this on blocked tasks without - * holding rq->lock. - */ - lockdep_assert_rq_held(rq); - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - } - if (running) - put_prev_task(rq, p); - - p->sched_class->set_cpus_allowed(p, ctx); - mm_set_cpus_allowed(p->mm, ctx->new_mask); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->sched_class->set_cpus_allowed(p, ctx); } /* * Used for kthread_bind() and select_fallback_rq(), in both cases the user * affinity (if any) should be destroyed too. */ -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask) { struct affinity_context ac = { .new_mask = new_mask, @@ -2754,7 +2688,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) struct rcu_head rcu; }; - __do_set_cpus_allowed(p, &ac); + scoped_guard (__task_rq_lock, p) + do_set_cpus_allowed(p, &ac); /* * Because this is called with p->pi_lock held, it is not possible @@ -2792,7 +2727,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, * Use pi_lock to protect content of user_cpus_ptr * * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent - * do_set_cpus_allowed(). + * set_cpus_allowed_force(). */ raw_spin_lock_irqsave(&src->pi_lock, flags); if (src->user_cpus_ptr) { @@ -3064,8 +2999,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, unsigned int dest_cpu; int ret = 0; - update_rq_clock(rq); - if (kthread || is_migration_disabled(p)) { /* * Kernel threads are allowed on online && !active CPUs, @@ -3120,7 +3053,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, goto out; } - __do_set_cpus_allowed(p, ctx); + do_set_cpus_allowed(p, ctx); return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); @@ -3329,8 +3262,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - rseq_migrate(p); - sched_mm_cid_migrate_from(p); perf_event_task_migrate(p); } @@ -3529,13 +3460,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: - /* - * XXX When called from select_task_rq() we only - * hold p->pi_lock and again violate locking order. - * - * More yuck to audit. - */ - do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); + set_cpus_allowed_force(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: @@ -3777,7 +3702,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) ttwu_do_wakeup(p); ret = 1; } - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); return ret; } @@ -4231,7 +4156,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * __schedule(). See the comment for smp_mb__after_spinlock(). * * Form a control-dep-acquire with p->on_rq == 0 above, to ensure - * schedule()'s deactivate_task() has 'happened' and p will no longer + * schedule()'s block_task() has 'happened' and p will no longer * care about it's own p->state. See the comment in __schedule(). */ smp_acquire__after_ctrl_dep(); @@ -4370,7 +4295,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) ret = func(p, arg); if (rq) - rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; @@ -4487,7 +4412,6 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; - init_sched_mm_cid(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -4763,7 +4687,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) p->sched_task_group = tg; } #endif - rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -4827,7 +4750,6 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); - rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -5121,7 +5043,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); - rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -5284,19 +5205,16 @@ context_switch(struct rq *rq, struct task_struct *prev, * * kernel -> user switch + mmdrop_lazy_tlb() active * user -> user switch - * - * switch_mm_cid() needs to be updated if the barriers provided - * by context_switch() are modified. */ - if (!next->mm) { // to kernel + if (!next->mm) { // to kernel enter_lazy_tlb(prev->active_mm, next); next->active_mm = prev->active_mm; - if (prev->mm) // from user + if (prev->mm) // from user mmgrab_lazy_tlb(prev->active_mm); else prev->active_mm = NULL; - } else { // to user + } else { // to user membarrier_switch_mm(rq, prev->active_mm, next->mm); /* * sys_membarrier() requires an smp_mb() between setting @@ -5309,15 +5227,20 @@ context_switch(struct rq *rq, struct task_struct *prev, switch_mm_irqs_off(prev->active_mm, next->mm, next); lru_gen_use_mm(next->mm); - if (!prev->mm) { // from kernel + if (!prev->mm) { // from kernel /* will mmdrop_lazy_tlb() in finish_task_switch(). */ rq->prev_mm = prev->active_mm; prev->active_mm = NULL; } } - /* switch_mm_cid() requires the memory barriers above. */ - switch_mm_cid(rq, prev, next); + mm_cid_switch_to(prev, next); + + /* + * Tell rseq that the task was scheduled in. Must be after + * switch_mm_cid() to get the TIF flag set. + */ + rseq_sched_switch_event(next); prepare_lock_switch(rq, next, rf); @@ -5602,7 +5525,6 @@ void sched_tick(void) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); sched_core_tick(rq); - task_tick_mm_cid(rq, donor); scx_tick(rq); rq_unlock(rq, &rf); @@ -5692,7 +5614,7 @@ static void sched_tick_remote(struct work_struct *work) * reasonable amount of time. */ u64 delta = rq_clock_task(rq) - curr->se.exec_start; - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30); } curr->sched_class->task_tick(rq, curr, 0); @@ -5916,19 +5838,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; -#ifdef CONFIG_SCHED_CLASS_EXT - /* - * SCX requires a balance() call before every pick_task() including when - * waking up from SCHED_IDLE. If @start_class is below SCX, start from - * SCX instead. Also, set a flag to detect missing balance() call. - */ - if (scx_enabled()) { - rq->scx.flags |= SCX_RQ_BAL_PENDING; - if (sched_class_above(&ext_sched_class, start_class)) - start_class = &ext_sched_class; - } -#endif - /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same @@ -5972,7 +5881,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assume the next prioritized class is idle_sched_class */ if (!p) { - p = pick_task_idle(rq); + p = pick_task_idle(rq, rf); put_prev_set_next_task(rq, prev, p); } @@ -5984,11 +5893,15 @@ restart: for_each_active_class(class) { if (class->pick_next_task) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) return p; } else { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) { put_prev_set_next_task(rq, prev, p); return p; @@ -6018,7 +5931,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b) return a->core_cookie == b->core_cookie; } -static inline struct task_struct *pick_task(struct rq *rq) +/* + * Careful; this can return RETRY_TASK, it does not include the retry-loop + * itself due to the whole SMT pick retry thing below. + */ +static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -6026,7 +5943,7 @@ static inline struct task_struct *pick_task(struct rq *rq) rq->dl_server = NULL; for_each_active_class(class) { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); if (p) return p; } @@ -6041,7 +5958,7 @@ static void queue_core_balance(struct rq *rq); static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *next, *p, *max = NULL; + struct task_struct *next, *p, *max; const struct cpumask *smt_mask; bool fi_before = false; bool core_clock_updated = (rq == rq->core); @@ -6126,7 +6043,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * and there are no cookied tasks running on siblings. */ if (!need_sync) { - next = pick_task(rq); +restart_single: + next = pick_task(rq, rf); + if (unlikely(next == RETRY_TASK)) + goto restart_single; if (!next->core_cookie) { rq->core_pick = NULL; rq->core_dl_server = NULL; @@ -6146,6 +6066,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * Tie-break prio towards the current CPU */ +restart_multi: + max = NULL; for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); @@ -6157,7 +6079,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); - rq_i->core_pick = p = pick_task(rq_i); + p = pick_task(rq_i, rf); + if (unlikely(p == RETRY_TASK)) + goto restart_multi; + + rq_i->core_pick = p; rq_i->core_dl_server = rq_i->dl_server; if (!max || prio_less(max, p, fi_before)) @@ -6179,7 +6105,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (cookie) p = sched_core_find(rq_i, cookie); if (!p) - p = idle_sched_class.pick_task(rq_i); + p = idle_sched_class.pick_task(rq_i, rf); } rq_i->core_pick = p; @@ -6812,6 +6738,7 @@ static void __sched notrace __schedule(int sched_mode) local_irq_disable(); rcu_note_context_switch(preempt); + migrate_disable_switch(rq, prev); /* * Make sure that signal_pending_state()->signal_pending() below @@ -6918,7 +6845,6 @@ keep_resched: */ ++*switch_count; - migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev) || prev->se.sched_delayed); @@ -7326,7 +7252,7 @@ void rt_mutex_post_schedule(void) */ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int prio, oldprio, queued, running, queue_flag = + int prio, oldprio, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class, *next_class; struct rq_flags rf; @@ -7388,64 +7314,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) prev_class = p->sched_class; next_class = __setscheduler_class(p->policy, prio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flag); - if (running) - put_prev_task(rq, p); + if (prev_class != next_class) + queue_flag |= DEQUEUE_CLASS; - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. -dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || - (pi_task && dl_prio(pi_task->prio) && - dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.pi_se = pi_task->dl.pi_se; - queue_flag |= ENQUEUE_REPLENISH; + scoped_guard (sched_change, p, queue_flag) { + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; + scope->flags |= ENQUEUE_REPLENISH; + } else { + p->dl.pi_se = &p->dl; + } + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (oldprio < prio) + scope->flags |= ENQUEUE_HEAD; } else { - p->dl.pi_se = &p->dl; + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (rt_prio(oldprio)) + p->rt.timeout = 0; } - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (oldprio < prio) - queue_flag |= ENQUEUE_HEAD; - } else { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (rt_prio(oldprio)) - p->rt.timeout = 0; - } - - p->sched_class = next_class; - p->prio = prio; - - check_class_changing(rq, p, prev_class); - if (queued) - enqueue_task(rq, p, queue_flag); - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); + p->sched_class = next_class; + p->prio = prio; + } out_unlock: /* Avoid rq from going away on us: */ preempt_disable(); rq_unpin_lock(rq, &rf); __balance_callbacks(rq); - raw_spin_rq_unlock(rq); + rq_repin_lock(rq, &rf); + __task_rq_unlock(rq, p, &rf); preempt_enable(); } @@ -8084,26 +7997,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - bool queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(p, &rf); - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); - if (running) - put_prev_task(rq, p); - - p->numa_preferred_nid = nid; - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - task_rq_unlock(rq, p, &rf); + guard(task_rq_lock)(p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->numa_preferred_nid = nid; } #endif /* CONFIG_NUMA_BALANCING */ @@ -8141,18 +8037,15 @@ static int __balance_push_cpu_stop(void *arg) struct rq_flags rf; int cpu; - raw_spin_lock_irq(&p->pi_lock); - rq_lock(rq, &rf); - - update_rq_clock(rq); - - if (task_rq(p) == rq && task_on_rq_queued(p)) { + scoped_guard (raw_spinlock_irq, &p->pi_lock) { cpu = select_fallback_rq(rq->cpu, p); - rq = __migrate_task(rq, &rf, p, cpu); - } - rq_unlock(rq, &rf); - raw_spin_unlock_irq(&p->pi_lock); + rq_lock(rq, &rf); + update_rq_clock(rq); + if (task_rq(p) == rq && task_on_rq_queued(p)) + rq = __migrate_task(rq, &rf, p, cpu); + rq_unlock(rq, &rf); + } put_task_struct(p); @@ -8591,6 +8484,8 @@ void __init sched_init_smp(void) { sched_init_numa(NUMA_NO_NODE); + prandom_init_once(&sched_rnd_state); + /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot @@ -9207,38 +9102,23 @@ static void sched_change_group(struct task_struct *tsk) */ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { - int queued, running, queue_flags = - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + bool resched = false; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - update_rq_clock(rq); - - running = task_current_donor(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, queue_flags); - if (running) - put_prev_task(rq, tsk); - - sched_change_group(tsk); - if (!for_autogroup) - scx_cgroup_move_task(tsk); + scoped_guard (sched_change, tsk, queue_flags) { + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); + if (scope->running) + resched = true; + } - if (queued) - enqueue_task(rq, tsk, queue_flags); - if (running) { - set_next_task(rq, tsk); - /* - * After changing group, the running task may have joined a - * throttled one but it's still the running task. Trigger a - * resched to make sure that task can still run. - */ + if (resched) resched_curr(rq); - } } static struct cgroup_subsys_state * @@ -9606,7 +9486,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; + cfs_rq->runtime_remaining = 1; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); @@ -10374,557 +10254,571 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) } #ifdef CONFIG_SCHED_MM_CID - -/* - * @cid_lock: Guarantee forward-progress of cid allocation. - * - * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock - * is only used when contention is detected by the lock-free allocation so - * forward progress can be guaranteed. - */ -DEFINE_RAW_SPINLOCK(cid_lock); - -/* - * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. - * - * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is - * detected, it is set to 1 to ensure that all newly coming allocations are - * serialized by @cid_lock until the allocation which detected contention - * completes and sets @use_cid_lock back to 0. This guarantees forward progress - * of a cid allocation. - */ -int use_cid_lock; - /* - * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid - * concurrently with respect to the execution of the source runqueue context - * switch. - * - * There is one basic properties we want to guarantee here: - * - * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively - * used by a task. That would lead to concurrent allocation of the cid and - * userspace corruption. + * Concurrency IDentifier management * - * Provide this guarantee by introducing a Dekker memory ordering to guarantee - * that a pair of loads observe at least one of a pair of stores, which can be - * shown as: + * Serialization rules: * - * X = Y = 0 + * mm::mm_cid::mutex: Serializes fork() and exit() and therefore + * protects mm::mm_cid::users. * - * w[X]=1 w[Y]=1 - * MB MB - * r[Y]=y r[X]=x + * mm::mm_cid::lock: Serializes mm_update_max_cids() and + * mm_update_cpus_allowed(). Nests in mm_cid::mutex + * and runqueue lock. * - * Which guarantees that x==0 && y==0 is impossible. But rather than using - * values 0 and 1, this algorithm cares about specific state transitions of the - * runqueue current task (as updated by the scheduler context switch), and the - * per-mm/cpu cid value. + * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks + * and can only be modified with atomic operations. * - * Let's introduce task (Y) which has task->mm == mm and task (N) which has - * task->mm != mm for the rest of the discussion. There are two scheduler state - * transitions on context switch we care about: + * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue + * lock. * - * (TSA) Store to rq->curr with transition from (N) to (Y) + * CID ownership: * - * (TSB) Store to rq->curr with transition from (Y) to (N) + * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or + * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the + * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode, + * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the + * task needs to drop the CID into the pool when scheduling out. Both bits + * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is + * actually handed over to user space in the RSEQ memory. * - * On the remote-clear side, there is one transition we care about: + * Mode switching: * - * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag + * Switching to per CPU mode happens when the user count becomes greater + * than the maximum number of CIDs, which is calculated by: * - * There is also a transition to UNSET state which can be performed from all - * sides (scheduler, remote-clear). It is always performed with a cmpxchg which - * guarantees that only a single thread will succeed: + * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users); + * max_cids = min(1.25 * opt_cids, num_possible_cpus()); * - * (TMB) cmpxchg to *pcpu_cid to mark UNSET + * The +25% allowance is useful for tight CPU masks in scenarios where only + * a few threads are created and destroyed to avoid frequent mode + * switches. Though this allowance shrinks, the closer opt_cids becomes to + * num_possible_cpus(), which is the (unfortunate) hard ABI limit. * - * Just to be clear, what we do _not_ want to happen is a transition to UNSET - * when a thread is actively using the cid (property (1)). + * At the point of switching to per CPU mode the new user is not yet + * visible in the system, so the task which initiated the fork() runs the + * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and + * either transfers each tasks owned CID to the CPU the task runs on or + * drops it into the CID pool if a task is not on a CPU at that point in + * time. Tasks which schedule in before the task walk reaches them do the + * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes + * it's guaranteed that no task related to that MM owns a CID anymore. * - * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. + * Switching back to task mode happens when the user count goes below the + * threshold which was recorded on the per CPU mode switch: * - * Scenario A) (TSA)+(TMA) (from next task perspective) + * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2); * - * CPU0 CPU1 + * This threshold is updated when a affinity change increases the number of + * allowed CPUs for the MM, which might cause a switch back to per task + * mode. * - * Context switch CS-1 Remote-clear - * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) - * (implied barrier after cmpxchg) - * - switch_mm_cid() - * - memory barrier (see switch_mm_cid() - * comment explaining how this barrier - * is combined with other scheduler - * barriers) - * - mm_cid_get (next) - * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) + * If the switch back was initiated by a exiting task, then that task runs + * the fixup function. If it was initiated by a affinity change, then it's + * run either in the deferred update function in context of a workqueue or + * by a task which forks a new one or by a task which exits. Whatever + * happens first. mm_cid_fixup_cpus_to_task() walks through the possible + * CPUs and either transfers the CPU owned CIDs to a related task which + * runs on the CPU or drops it into the pool. Tasks which schedule in on a + * CPU which the walk did not cover yet do the handover themself. * - * This Dekker ensures that either task (Y) is observed by the - * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are - * observed. + * This transition from CPU to per task ownership happens in two phases: * - * If task (Y) store is observed by rcu_dereference(), it means that there is - * still an active task on the cpu. Remote-clear will therefore not transition - * to UNSET, which fulfills property (1). + * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task + * CID and denotes that the CID is only temporarily owned by the + * task. When it schedules out the task drops the CID back into the + * pool if this bit is set. * - * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), - * it will move its state to UNSET, which clears the percpu cid perhaps - * uselessly (which is not an issue for correctness). Because task (Y) is not - * observed, CPU1 can move ahead to set the state to UNSET. Because moving - * state to UNSET is done with a cmpxchg expecting that the old state has the - * LAZY flag set, only one thread will successfully UNSET. + * 2) The initiating context walks the per CPU space and after completion + * clears mm:mm_cid.transit. So after that point the CIDs are strictly + * task owned again. * - * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 - * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and - * CPU1 will observe task (Y) and do nothing more, which is fine. + * This two phase transition is required to prevent CID space exhaustion + * during the transition as a direct transfer of ownership would fail if + * two tasks are scheduled in on the same CPU before the fixup freed per + * CPU CIDs. * - * What we are effectively preventing with this Dekker is a scenario where - * neither LAZY flag nor store (Y) are observed, which would fail property (1) - * because this would UNSET a cid which is actively used. + * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID + * related to that MM is owned by a CPU anymore. */ -void sched_mm_cid_migrate_from(struct task_struct *t) -{ - t->migrate_from_cpu = task_cpu(t); -} - -static -int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid) +/* + * Update the CID range properties when the constraints change. Invoked via + * fork(), exit() and affinity changes + */ +static void __mm_update_max_cids(struct mm_mm_cid *mc) { - struct mm_struct *mm = t->mm; - struct task_struct *src_task; - int src_cid, last_mm_cid; + unsigned int opt_cids, max_cids; - if (!mm) - return -1; + /* Calculate the new optimal constraint */ + opt_cids = min(mc->nr_cpus_allowed, mc->users); - last_mm_cid = t->last_mm_cid; - /* - * If the migrated task has no last cid, or if the current - * task on src rq uses the cid, it means the source cid does not need - * to be moved to the destination cpu. - */ - if (last_mm_cid == -1) - return -1; - src_cid = READ_ONCE(src_pcpu_cid->cid); - if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) - return -1; + /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */ + max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus()); + WRITE_ONCE(mc->max_cids, max_cids); +} - /* - * If we observe an active task using the mm on this rq, it means we - * are not the last task to be migrated from this cpu for this mm, so - * there is no need to move src_cid to the destination cpu. - */ - guard(rcu)(); - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - t->last_mm_cid = -1; - return -1; - } +static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc) +{ + unsigned int opt_cids; - return src_cid; + opt_cids = min(mc->nr_cpus_allowed, mc->users); + /* Has to be at least 1 because 0 indicates PCPU mode off */ + return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1); } -static -int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, - struct task_struct *t, - struct mm_cid *src_pcpu_cid, - int src_cid) +static bool mm_update_max_cids(struct mm_struct *mm) { - struct task_struct *src_task; - struct mm_struct *mm = t->mm; - int lazy_cid; + struct mm_mm_cid *mc = &mm->mm_cid; - if (src_cid == -1) - return -1; + lockdep_assert_held(&mm->mm_cid.lock); - /* - * Attempt to clear the source cpu cid to move it to the destination - * cpu. - */ - lazy_cid = mm_cid_set_lazy_put(src_cid); - if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) - return -1; - - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ + /* Clear deferred mode switch flag. A change is handled by the caller */ + mc->update_deferred = false; + __mm_update_max_cids(mc); - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, this task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid = -1; - return -1; - } + /* Check whether owner mode must be changed */ + if (!mc->percpu) { + /* Enable per CPU mode when the number of users is above max_cids */ + if (mc->users > mc->max_cids) + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); + } else { + /* Switch back to per task if user count under threshold */ + if (mc->users < mc->pcpu_thrs) + mc->pcpu_thrs = 0; } - /* - * The src_cid is unused, so it can be unset. - */ - if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - return -1; - WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); - return src_cid; + /* Mode change required? */ + if (!!mc->percpu == !!mc->pcpu_thrs) + return false; + /* When switching back to per TASK mode, set the transition flag */ + if (!mc->pcpu_thrs) + WRITE_ONCE(mc->transit, MM_CID_TRANSIT); + WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); + return true; } -/* - * Migration to dst cpu. Called with dst_rq lock held. - * Interrupts are disabled, which keeps the window of cid ownership without the - * source rq lock held small. - */ -void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { - struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; - struct mm_struct *mm = t->mm; - int src_cid, src_cpu; - bool dst_cid_is_set; - struct rq *src_rq; - - lockdep_assert_rq_held(dst_rq); + struct cpumask *mm_allowed; + struct mm_mm_cid *mc; + unsigned int weight; - if (!mm) + if (!mm || !READ_ONCE(mm->mm_cid.users)) return; - src_cpu = t->migrate_from_cpu; - if (src_cpu == -1) { - t->last_mm_cid = -1; - return; - } /* - * Move the src cid if the dst cid is unset. This keeps id - * allocation closest to 0 in cases where few threads migrate around - * many CPUs. - * - * If destination cid or recent cid is already set, we may have - * to just clear the src cid to ensure compactness in frequent - * migrations scenarios. - * - * It is not useful to clear the src cid when the number of threads is - * greater or equal to the number of allowed CPUs, because user-space - * can expect that the number of allowed cids can reach the number of - * allowed CPUs. - */ - dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); - dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || - !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); - if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) + * mm::mm_cid::mm_cpus_allowed is the superset of each threads + * allowed CPUs mask which means it can only grow. + */ + mc = &mm->mm_cid; + guard(raw_spinlock)(&mc->lock); + mm_allowed = mm_cpus_allowed(mm); + weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); + if (weight == mc->nr_cpus_allowed) return; - src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); - src_rq = cpu_rq(src_cpu); - src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); - if (src_cid == -1) + + WRITE_ONCE(mc->nr_cpus_allowed, weight); + __mm_update_max_cids(mc); + if (!mc->percpu) return; - src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, - src_cid); - if (src_cid == -1) + + /* Adjust the threshold to the wider set */ + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); + /* Switch back to per task mode? */ + if (mc->users >= mc->pcpu_thrs) return; - if (dst_cid_is_set) { - __mm_cid_put(mm, src_cid); + + /* Don't queue twice */ + if (mc->update_deferred) return; - } - /* Move src_cid to dst cpu. */ - mm_cid_snapshot_time(dst_rq, mm); - WRITE_ONCE(dst_pcpu_cid->cid, src_cid); - WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); + + /* Queue the irq work, which schedules the real work */ + mc->update_deferred = true; + irq_work_queue(&mc->irq_work); } -static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, - int cpu) +static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) { - struct rq *rq = cpu_rq(cpu); - struct task_struct *t; - int cid, lazy_cid; + if (cid_on_cpu(t->mm_cid.cid)) { + unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid); - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid)) - return; + t->mm_cid.cid = cid_to_transit_cid(cid); + pcp->cid = t->mm_cid.cid; + } +} - /* - * Clear the cpu cid if it is set to keep cid allocation compact. If - * there happens to be other tasks left on the source cpu using this - * mm, the next task using this mm will reallocate its cid on context - * switch. - */ - lazy_cid = mm_cid_set_lazy_put(cid); - if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) - return; +static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) +{ + unsigned int cpu; - /* - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm matches the scheduler barrier in context_switch() - * between store to rq->curr and load of prev and next task's - * per-mm/cpu cid. - * - * The implicit barrier after cmpxchg per-mm/cpu cid before loading - * rq->curr->mm_cid_active matches the barrier in - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and - * sched_mm_cid_after_execve() between store to t->mm_cid_active and - * load of per-mm/cpu cid. - */ + /* Walk the CPUs and fixup all stale CIDs */ + for_each_possible_cpu(cpu) { + struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu); + struct rq *rq = cpu_rq(cpu); - /* - * If we observe an active task using the mm on this rq after setting - * the lazy-put flag, that task will be responsible for transitioning - * from lazy-put flag set to MM_CID_UNSET. - */ - scoped_guard (rcu) { - t = rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) - return; + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(rq_lock_irq)(rq); + /* Is the CID still owned by the CPU? */ + if (cid_on_cpu(pcp->cid)) { + /* + * If rq->curr has @mm, transfer it with the + * transition bit set. Otherwise drop it. + */ + if (rq->curr->mm == mm && rq->curr->mm_cid.active) + mm_cid_transit_to_task(rq->curr, pcp); + else + mm_drop_cid_on_cpu(mm, pcp); + + } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) { + unsigned int cid = rq->curr->mm_cid.cid; + + /* Ensure it has the transition bit set */ + if (!cid_in_transit(cid)) { + cid = cid_to_transit_cid(cid); + rq->curr->mm_cid.cid = cid; + pcp->cid = cid; + } + } } + /* Clear the transition bit */ + WRITE_ONCE(mm->mm_cid.transit, 0); +} - /* - * The cid is unused, so it can be unset. - * Disable interrupts to keep the window of cid ownership without rq - * lock small. - */ - scoped_guard (irqsave) { - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); +static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) +{ + if (cid_on_task(t->mm_cid.cid)) { + t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid); + pcp->cid = t->mm_cid.cid; } } -static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) +static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) { - struct rq *rq = cpu_rq(cpu); - struct mm_cid *pcpu_cid; - struct task_struct *curr; - u64 rq_clock; + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(task_rq_lock)(t); + /* If the task is not active it is not in the users count */ + if (!t->mm_cid.active) + return false; + if (cid_on_task(t->mm_cid.cid)) { + /* If running on the CPU, transfer the CID, otherwise drop it */ + if (task_rq(t)->curr == t) + mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); + else + mm_unset_cid_on_task(t); + } + return true; +} - /* - * rq->clock load is racy on 32-bit but one spurious clear once in a - * while is irrelevant. - */ - rq_clock = READ_ONCE(rq->clock); - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); +static void mm_cid_fixup_tasks_to_cpus(void) +{ + struct mm_struct *mm = current->mm; + struct task_struct *p, *t; + unsigned int users; /* - * In order to take care of infrequently scheduled tasks, bump the time - * snapshot associated with this cid if an active task using the mm is - * observed on this rq. + * This can obviously race with a concurrent affinity change, which + * increases the number of allowed CPUs for this mm, but that does + * not affect the mode and only changes the CID constraints. A + * possible switch back to per task mode happens either in the + * deferred handler function or in the next fork()/exit(). + * + * The caller has already transferred. The newly incoming task is + * already accounted for, but not yet visible. */ - scoped_guard (rcu) { - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - return; - } + users = mm->mm_cid.users - 2; + if (!users) + return; + + guard(rcu)(); + for_other_threads(current, t) { + if (mm_cid_fixup_task_to_cpu(t, mm)) + users--; } - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) + if (!users) return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); + + /* Happens only for VM_CLONE processes. */ + for_each_process_thread(p, t) { + if (t == current || t->mm != mm) + continue; + if (mm_cid_fixup_task_to_cpu(t, mm)) { + if (--users == 0) + return; + } + } } -static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, - int weight) +static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) { - struct mm_cid *pcpu_cid; - int cid; - - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid) || cid < weight) - return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); + t->mm_cid.active = 1; + mm->mm_cid.users++; + return mm_update_max_cids(mm); } -static void task_mm_cid_work(struct callback_head *work) +void sched_mm_cid_fork(struct task_struct *t) { - unsigned long now = jiffies, old_scan, next_scan; - struct task_struct *t = current; - struct cpumask *cidmask; - struct mm_struct *mm; - int weight, cpu; + struct mm_struct *mm = t->mm; + bool percpu; - WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); - work->next = work; /* Prevent double-add */ - if (t->flags & PF_EXITING) - return; - mm = t->mm; - if (!mm) - return; - old_scan = READ_ONCE(mm->mm_cid_next_scan); - next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); - if (!old_scan) { - unsigned long res; - - res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); - if (res != old_scan) - old_scan = res; + guard(mutex)(&mm->mm_cid.mutex); + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu); + + /* First user ? */ + if (!mm->mm_cid.users) { + sched_mm_cid_add_user(t, mm); + t->mm_cid.cid = mm_get_cid(mm); + /* Required for execve() */ + pcp->cid = t->mm_cid.cid; + return; + } + + if (!sched_mm_cid_add_user(t, mm)) { + if (!mm->mm_cid.percpu) + t->mm_cid.cid = mm_get_cid(mm); + return; + } + + /* Handle the mode change and transfer current's CID */ + percpu = !!mm->mm_cid.percpu; + if (!percpu) + mm_cid_transit_to_task(current, pcp); else - old_scan = next_scan; + mm_cid_transfer_to_cpu(current, pcp); } - if (time_before(now, old_scan)) - return; - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) - return; - cidmask = mm_cidmask(mm); - /* Clear cids that were not recently used. */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_old(mm, cpu); - weight = cpumask_weight(cidmask); - /* - * Clear cids that are greater or equal to the cidmask weight to - * recompact it. - */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_weight(mm, cpu, weight); -} -void init_sched_mm_cid(struct task_struct *t) -{ - struct mm_struct *mm = t->mm; - int mm_users = 0; - - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) - mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); + if (percpu) { + mm_cid_fixup_tasks_to_cpus(); + } else { + mm_cid_fixup_cpus_to_tasks(mm); + t->mm_cid.cid = mm_get_cid(mm); } - t->cid_work.next = &t->cid_work; /* Protect against double add */ - init_task_work(&t->cid_work, task_mm_cid_work); } -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) +static bool sched_mm_cid_remove_user(struct task_struct *t) { - struct callback_head *work = &curr->cid_work; - unsigned long now = jiffies; - - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || - work->next != work) - return; - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) - return; - - /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME); + t->mm_cid.active = 0; + scoped_guard(preempt) { + /* Clear the transition bit */ + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); + mm_unset_cid_on_task(t); + } + t->mm->mm_cid.users--; + return mm_update_max_cids(t->mm); } -void sched_mm_cid_exit_signals(struct task_struct *t) +static bool __sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - if (!mm) - return; - - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); + if (!sched_mm_cid_remove_user(t)) + return false; /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). + * Contrary to fork() this only deals with a switch back to per + * task mode either because the above decreased users or an + * affinity change increased the number of allowed CPUs and the + * deferred fixup did not run yet. */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return false; + /* + * A failed fork(2) cleanup never gets here, so @current must have + * the same MM as @t. That's true for exit() and the failed + * pthread_create() cleanup case. + */ + if (WARN_ON_ONCE(current->mm != mm)) + return false; + return true; } -void sched_mm_cid_before_execve(struct task_struct *t) +/* + * When a task exits, the MM CID held by the task is not longer required as + * the task cannot return to user space. + */ +void sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - if (!mm) + if (!mm || !t->mm_cid.active) return; + /* + * Ensure that only one instance is doing MM CID operations within + * a MM. The common case is uncontended. The rare fixup case adds + * some overhead. + */ + scoped_guard(mutex, &mm->mm_cid.mutex) { + /* mm_cid::mutex is sufficient to protect mm_cid::users */ + if (likely(mm->mm_cid.users > 1)) { + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + if (!__sched_mm_cid_exit(t)) + return; + /* Mode change required. Transfer currents CID */ + mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); + } + mm_cid_fixup_cpus_to_tasks(mm); + return; + } + /* Last user */ + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Required across execve() */ + if (t == current) + mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu)); + /* Ignore mode change. There is nothing to do. */ + sched_mm_cid_remove_user(t); + } + } - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). + * As this is the last user (execve(), process exit or failed + * fork(2)) there is no concurrency anymore. + * + * Synchronize eventually pending work to ensure that there are no + * dangling references left. @t->mm_cid.users is zero so nothing + * can queue this work anymore. */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + irq_work_sync(&mm->mm_cid.irq_work); + cancel_work_sync(&mm->mm_cid.work); +} + +/* Deactivate MM CID allocation across execve() */ +void sched_mm_cid_before_execve(struct task_struct *t) +{ + sched_mm_cid_exit(t); } +/* Reactivate MM CID after successful execve() */ void sched_mm_cid_after_execve(struct task_struct *t) { - struct mm_struct *mm = t->mm; - struct rq *rq; + sched_mm_cid_fork(t); +} - if (!mm) +static void mm_cid_work_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); + + guard(mutex)(&mm->mm_cid.mutex); + /* Did the last user task exit already? */ + if (!mm->mm_cid.users) return; - preempt_disable(); - rq = this_rq(); - scoped_guard (rq_lock_irqsave, rq) { - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Have fork() or exit() handled it already? */ + if (!mm->mm_cid.update_deferred) + return; + /* This clears mm_cid::update_deferred */ + if (!mm_update_max_cids(mm)) + return; + /* Affinity changes can only switch back to task mode */ + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return; } + mm_cid_fixup_cpus_to_tasks(mm); } -void sched_mm_cid_fork(struct task_struct *t) +static void mm_cid_irq_work(struct irq_work *work) { - WARN_ON_ONCE(!t->mm || t->mm_cid != -1); - t->mm_cid_active = 1; + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work); + + /* + * Needs to be unconditional because mm_cid::lock cannot be held + * when scheduling work as mm_update_cpus_allowed() nests inside + * rq::lock and schedule_work() might end up in wakeup... + */ + schedule_work(&mm->mm_cid.work); } -#endif /* CONFIG_SCHED_MM_CID */ -#ifdef CONFIG_SCHED_CLASS_EXT -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx) +void mm_init_cid(struct mm_struct *mm, struct task_struct *p) +{ + mm->mm_cid.max_cids = 0; + mm->mm_cid.percpu = 0; + mm->mm_cid.transit = 0; + mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; + mm->mm_cid.users = 0; + mm->mm_cid.pcpu_thrs = 0; + mm->mm_cid.update_deferred = 0; + raw_spin_lock_init(&mm->mm_cid.lock); + mutex_init(&mm->mm_cid.mutex); + mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); + INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); + bitmap_zero(mm_cidmask(mm), num_possible_cpus()); +} +#else /* CONFIG_SCHED_MM_CID */ +static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } +#endif /* !CONFIG_SCHED_MM_CID */ + +static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); + +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) { + struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); struct rq *rq = task_rq(p); + /* + * Must exclusively use matched flags since this is both dequeue and + * enqueue. + */ + WARN_ON_ONCE(flags & 0xFFFF0000); + lockdep_assert_rq_held(rq); - *ctx = (struct sched_enq_and_set_ctx){ + if (!(flags & DEQUEUE_NOCLOCK)) { + update_rq_clock(rq); + flags |= DEQUEUE_NOCLOCK; + } + + if (flags & DEQUEUE_CLASS) { + if (p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); + } + + *ctx = (struct sched_change_ctx){ .p = p, - .queue_flags = queue_flags, + .flags = flags, .queued = task_on_rq_queued(p), - .running = task_current(rq, p), + .running = task_current_donor(rq, p), }; - update_rq_clock(rq); + if (!(flags & DEQUEUE_CLASS)) { + if (p->sched_class->get_prio) + ctx->prio = p->sched_class->get_prio(rq, p); + else + ctx->prio = p->prio; + } + if (ctx->queued) - dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + dequeue_task(rq, p, flags); if (ctx->running) put_prev_task(rq, p); + + if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from) + p->sched_class->switched_from(rq, p); + + return ctx; } -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +void sched_change_end(struct sched_change_ctx *ctx) { - struct rq *rq = task_rq(ctx->p); + struct task_struct *p = ctx->p; + struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); + if (ctx->queued) - enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + enqueue_task(rq, p, ctx->flags); if (ctx->running) - set_next_task(rq, ctx->p); + set_next_task(rq, p); + + if (ctx->flags & ENQUEUE_CLASS) { + if (p->sched_class->switched_to) + p->sched_class->switched_to(rq, p); + } else { + p->sched_class->prio_changed(rq, p, ctx->prio); + } } -#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index cdd740b3f774..37b572cc8aca 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -166,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, * cpudl_clear - remove a CPU from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target CPU + * @online: the online state of the deadline runqueue * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_clear(struct cpudl *cp, int cpu) +void cpudl_clear(struct cpudl *cp, int cpu, bool online) { int old_idx, new_cpu; unsigned long flags; @@ -184,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu) if (old_idx == IDX_INVALID) { /* * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is + * This could happen if rq_online_dl or rq_offline_dl is * called for a CPU without -dl tasks running. */ } else { @@ -195,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu) cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; cpudl_heapify(cp, old_idx); - - cpumask_set_cpu(cpu, cp->free_cpus); } + if (likely(online)) + __cpumask_set_cpu(cpu, cp->free_cpus); + else + __cpumask_clear_cpu(cpu, cp->free_cpus); + raw_spin_unlock_irqrestore(&cp->lock, flags); } @@ -228,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) cp->elements[new_idx].cpu = cpu; cp->elements[cpu].idx = new_idx; cpudl_heapify_up(cp, new_idx); - cpumask_clear_cpu(cpu, cp->free_cpus); + __cpumask_clear_cpu(cpu, cp->free_cpus); } else { cp->elements[old_idx].dl = dl; cpudl_heapify(cp, old_idx); @@ -238,26 +242,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) } /* - * cpudl_set_freecpu - Set the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_set_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_set_cpu(cpu, cp->free_cpus); -} - -/* - * cpudl_clear_freecpu - Clear the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_clear_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_clear_cpu(cpu, cp->free_cpus); -} - -/* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context */ diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 11c0f1faa7e1..d7699468eedd 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -19,8 +19,6 @@ struct cpudl { int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); -void cpudl_clear(struct cpudl *cp, int cpu); +void cpudl_clear(struct cpudl *cp, int cpu, bool online); int cpudl_init(struct cpudl *cp); -void cpudl_set_freecpu(struct cpudl *cp, int cpu); -void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7097de2c8cda..4f97896887ec 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -313,10 +313,8 @@ static u64 read_sum_exec_runtime(struct task_struct *t) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; - u64 utime, stime; struct task_struct *t; - unsigned int seq, nextseq; - unsigned long flags; + u64 utime, stime; /* * Update current task runtime to account pending time since last @@ -329,27 +327,19 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) if (same_thread_group(current, tsk)) (void) task_sched_runtime(current); - rcu_read_lock(); - /* Attempt a lockless read on the first round. */ - nextseq = 0; - do { - seq = nextseq; - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + guard(rcu)(); + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { times->utime = sig->utime; times->stime = sig->stime; times->sum_exec_runtime = sig->sum_sched_runtime; - for_each_thread(tsk, t) { + __for_each_thread(sig, t) { task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += read_sum_exec_runtime(t); } - /* If lockless access failed, take the lock. */ - nextseq = 1; - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - rcu_read_unlock(); + } } #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7b7671060bf9..67f540c23717 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -125,20 +125,11 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); - if (cpumask_subset(rd->span, cpu_active_mask)) - return cpumask_weight(rd->span); - - cpus = 0; - - for_each_cpu_and(i, rd->span, cpu_active_mask) - cpus++; - - return cpus; + return cpumask_weight_and(rd->span, cpu_active_mask); } static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) @@ -405,7 +396,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se); * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct sched_dl_entity *dl_se) +static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task) { struct hrtimer *timer = &dl_se->inactive_timer; struct rq *rq = rq_of_dl_se(dl_se); @@ -444,10 +435,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se) } else { struct task_struct *p = dl_task_of(dl_se); - if (dl_task(p)) + if (dl_task) sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (READ_ONCE(p->__state) == TASK_DEAD) @@ -1166,8 +1157,17 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ sched_clock_tick(); update_rq_clock(rq); - if (!dl_se->dl_runtime) + /* + * Make sure current has propagated its pending runtime into + * any relevant server through calling dl_server_update() and + * friends. + */ + rq->donor->sched_class->update_curr(rq); + + if (dl_se->dl_defer_idle) { + dl_server_stop(dl_se); return HRTIMER_NORESTART; + } if (dl_se->dl_defer_armed) { /* @@ -1416,10 +1416,11 @@ s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta } static inline void -update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, - int flags); +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags); + static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { + bool idle = rq->curr == rq->idle; s64 scaled_delta_exec; if (unlikely(delta_exec <= 0)) { @@ -1440,6 +1441,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 dl_se->runtime -= scaled_delta_exec; + if (dl_se->dl_defer_idle && !idle) + dl_se->dl_defer_idle = 0; + /* * The fair server can consume its runtime while throttled (not queued/ * running as regular CFS). @@ -1450,6 +1454,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 */ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { /* + * Non-servers would never get time accounted while throttled. + */ + WARN_ON_ONCE(!dl_server(dl_se)); + + /* + * While the server is marked idle, do not push out the + * activation further, instead wait for the period timer + * to lapse and stop the server. + */ + if (dl_se->dl_defer_idle && idle) { + /* + * The timer is at the zero-laxity point, this means + * dl_server_stop() / dl_server_start() can happen + * while now < deadline. This means update_dl_entity() + * will not replenish. Additionally start_dl_timer() + * will be set for 'deadline - runtime'. Negative + * runtime will not do. + */ + dl_se->runtime = 0; + return; + } + + /* * If the server was previously activated - the starving condition * took place, it this point it went away because the fair scheduler * was able to get runtime in background. So return to the initial @@ -1461,6 +1488,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 replenish_dl_new_period(dl_se, dl_se->rq); + if (idle) + dl_se->dl_defer_idle = 1; + /* * Not being able to start the timer seems problematic. If it could not * be started for whatever reason, we need to "unthrottle" the DL server @@ -1543,38 +1573,213 @@ throttle: * as time available for the fair server, avoiding a penalty for the * rt scheduler that did not consumed that time. */ -void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec) { - s64 delta_exec; - - if (!rq->fair_server.dl_defer) - return; - - /* no need to discount more */ - if (rq->fair_server.runtime < 0) - return; - - delta_exec = rq_clock_task(rq) - p->se.exec_start; - if (delta_exec < 0) - return; - - rq->fair_server.runtime -= delta_exec; - - if (rq->fair_server.runtime < 0) { - rq->fair_server.dl_defer_running = 0; - rq->fair_server.runtime = 0; - } - - p->se.exec_start = rq_clock_task(rq); + if (dl_se->dl_server_active && dl_se->dl_runtime && dl_se->dl_defer) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) + if (dl_se->dl_server_active && dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } +/* + * dl_server && dl_defer: + * + * 6 + * +--------------------+ + * v | + * +-------------+ 4 +-----------+ 5 +------------------+ + * +-> | A:init | <--- | D:running | -----> | E:replenish-wait | + * | +-------------+ +-----------+ +------------------+ + * | | | 1 ^ ^ | + * | | 1 +----------+ | 3 | + * | v | | + * | +--------------------------------+ 2 | + * | | | ----+ | + * | 8 | B:zero_laxity-wait | | | + * | | | <---+ | + * | +--------------------------------+ | + * | | ^ ^ 2 | + * | | 7 | 2 +--------------------+ + * | v | + * | +-------------+ | + * +-- | C:idle-wait | -+ + * +-------------+ + * ^ 7 | + * +---------+ + * + * + * [A] - init + * dl_server_active = 0 + * dl_throttled = 0 + * dl_defer_armed = 0 + * dl_defer_running = 0/1 + * dl_defer_idle = 0 + * + * [B] - zero_laxity-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 1 + * dl_defer_running = 0 + * dl_defer_idle = 0 + * + * [C] - idle-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 1 + * dl_defer_running = 0 + * dl_defer_idle = 1 + * + * [D] - running + * dl_server_active = 1 + * dl_throttled = 0 + * dl_defer_armed = 0 + * dl_defer_running = 1 + * dl_defer_idle = 0 + * + * [E] - replenish-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 0 + * dl_defer_running = 1 + * dl_defer_idle = 0 + * + * + * [1] A->B, A->D + * dl_server_start() + * dl_server_active = 1; + * enqueue_dl_entity() + * update_dl_entity(WAKEUP) + * if (!dl_defer_running) + * dl_defer_armed = 1; + * dl_throttled = 1; + * if (dl_throttled && start_dl_timer()) + * return; // [B] + * __enqueue_dl_entity(); + * // [D] + * + * // deplete server runtime from client-class + * [2] B->B, C->B, E->B + * dl_server_update() + * update_curr_dl_se() // idle = false + * if (dl_defer_idle) + * dl_defer_idle = 0; + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) + * dl_defer_running = 0; + * hrtimer_try_to_cancel(); // stop timer + * replenish_dl_new_period() + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; + * start_dl_timer(); // restart timer + * // [B] + * + * // timer actually fires means we have runtime + * [3] B->D + * dl_server_timer() + * if (dl_defer_armed) + * dl_defer_running = 1; + * enqueue_dl_entity(REPLENISH) + * replenish_dl_entity() + * // fwd period + * if (dl_throttled) + * dl_throttled = 0; + * if (dl_defer_armed) + * dl_defer_armed = 0; + * __enqueue_dl_entity(); + * // [D] + * + * // schedule server + * [4] D->A + * pick_task_dl() + * p = server_pick_task(); + * if (!p) + * dl_server_stop() + * dequeue_dl_entity(); + * hrtimer_try_to_cancel(); + * dl_defer_armed = 0; + * dl_throttled = 0; + * dl_server_active = 0; + * // [A] + * return p; + * + * // server running + * [5] D->E + * update_curr_dl_se() + * if (dl_runtime_exceeded()) + * dl_throttled = 1; + * dequeue_dl_entity(); + * start_dl_timer(); + * // [E] + * + * // server replenished + * [6] E->D + * dl_server_timer() + * enqueue_dl_entity(REPLENISH) + * replenish_dl_entity() + * fwd-period + * if (dl_throttled) + * dl_throttled = 0; + * __enqueue_dl_entity(); + * // [D] + * + * // deplete server runtime from idle + * [7] B->C, C->C + * dl_server_update_idle() + * update_curr_dl_se() // idle = true + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) + * if (dl_defer_idle) + * return; + * dl_defer_running = 0; + * hrtimer_try_to_cancel(); + * replenish_dl_new_period() + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; + * dl_defer_idle = 1; + * start_dl_timer(); // restart timer + * // [C] + * + * // stop idle server + * [8] C->A + * dl_server_timer() + * if (dl_defer_idle) + * dl_server_stop(); + * // [A] + * + * + * digraph dl_server { + * "A:init" -> "B:zero_laxity-wait" [label="1:dl_server_start"] + * "A:init" -> "D:running" [label="1:dl_server_start"] + * "B:zero_laxity-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] + * "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"] + * "C:idle-wait" -> "A:init" [label="8:dl_server_timer"] + * "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] + * "D:running" -> "A:init" [label="4:pick_task_dl"] + * "D:running" -> "E:replenish-wait" [label="5:update_curr_dl_se"] + * "E:replenish-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "E:replenish-wait" -> "D:running" [label="6:dl_server_timer"] + * } + * + * + * Notes: + * + * - When there are fair tasks running the most likely loop is [2]->[2]. + * the dl_server never actually runs, the timer never fires. + * + * - When there is actual fair starvation; the timer fires and starts the + * dl_server. This will then throttle and replenish like a normal DL + * task. Notably it will not 'defer' again. + * + * - When idle it will push the actication forward once, and then wait + * for the timer to hit or a non-idle update to restart things. + */ void dl_server_start(struct sched_dl_entity *dl_se) { struct rq *rq = dl_se->rq; @@ -1582,6 +1787,11 @@ void dl_server_start(struct sched_dl_entity *dl_se) if (!dl_server(dl_se) || dl_se->dl_server_active) return; + /* + * Update the current task to 'now'. + */ + rq->donor->sched_class->update_curr(rq); + if (WARN_ON_ONCE(!cpu_online(cpu_of(rq)))) return; @@ -1600,6 +1810,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se) hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; dl_se->dl_throttled = 0; + dl_se->dl_defer_idle = 0; dl_se->dl_server_active = 0; } @@ -1811,7 +2022,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online); cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { struct rb_node *leftmost = rb_first_cached(&dl_rq->root); @@ -2048,7 +2259,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - task_non_contending(dl_se); + task_non_contending(dl_se, true); } static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -2143,7 +2354,7 @@ static void yield_task_dl(struct rq *rq) * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). */ - rq->curr->dl.dl_yielded = 1; + rq->donor->dl.dl_yielded = 1; update_rq_clock(rq); update_curr_dl(rq); @@ -2173,7 +2384,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) struct rq *rq; if (!(flags & WF_TTWU)) - goto out; + return cpu; rq = cpu_rq(cpu); @@ -2211,7 +2422,6 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) } rcu_read_unlock(); -out: return cpu; } @@ -2355,7 +2565,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) * __pick_next_task_dl - Helper to pick the next -deadline task to run. * @rq: The runqueue to pick the next task from. */ -static struct task_struct *__pick_task_dl(struct rq *rq) +static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2369,7 +2579,7 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick_task(dl_se); + p = dl_se->server_pick_task(dl_se, rf); if (!p) { dl_server_stop(dl_se); goto again; @@ -2382,9 +2592,9 @@ again: return p; } -static struct task_struct *pick_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf) { - return __pick_task_dl(rq); + return __pick_task_dl(rq, rf); } static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) @@ -2883,9 +3093,10 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); - cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); + else + cpudl_clear(&rq->rd->cpudl, rq->cpu, true); } /* Assumes rq->lock is held */ @@ -2894,8 +3105,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_clear(&rq->rd->cpudl, rq->cpu); - cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, false); } void __init init_sched_dl_class(void) @@ -2973,7 +3183,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(&p->dl); + task_non_contending(&p->dl, false); /* * In case a task is setscheduled out from SCHED_DEADLINE we need to @@ -3045,23 +3255,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } } +static u64 get_prio_dl(struct rq *rq, struct task_struct *p) +{ + return p->dl.deadline; +} + /* * If the scheduling parameters of a -deadline task changed, * a push or pull operation might be needed. */ -static void prio_changed_dl(struct rq *rq, struct task_struct *p, - int oldprio) +static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline) { if (!task_on_rq_queued(p)) return; - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) + if (p->dl.deadline == old_deadline) + return; + + if (dl_time_before(old_deadline, p->dl.deadline)) deadline_queue_pull_task(rq); if (task_current_donor(rq, p)) { @@ -3094,6 +3305,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(dl) = { + .queue_mask = 8, + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -3116,6 +3329,7 @@ DEFINE_SCHED_CLASS(dl) = { .task_tick = task_tick_dl, .task_fork = task_fork_dl, + .get_prio = get_prio_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02e16b70a790..41caa22e0680 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; + s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) right_vruntime = last->vruntime; - min_vruntime = cfs_rq->min_vruntime; + zero_vruntime = cfs_rq->zero_vruntime; raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", SPLIT_NS(left_deadline)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", SPLIT_NS(left_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", - SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", + SPLIT_NS(zero_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", SPLIT_NS(avg_vruntime(cfs_rq))); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ecb251e883ea..6827689a0966 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -25,7 +25,7 @@ static struct scx_sched __rcu *scx_root; * guarantee system safety. Maintain a dedicated task list which contains every * task between its fork and eventual free. */ -static DEFINE_SPINLOCK(scx_tasks_lock); +static DEFINE_RAW_SPINLOCK(scx_tasks_lock); static LIST_HEAD(scx_tasks); /* ops enable/disable */ @@ -476,7 +476,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter) BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); @@ -507,14 +507,14 @@ static void scx_task_iter_unlock(struct scx_task_iter *iter) __scx_task_iter_rq_unlock(iter); if (iter->list_locked) { iter->list_locked = false; - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); } } static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { if (!iter->list_locked) { - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->list_locked = true; } } @@ -1474,7 +1474,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags static void yield_task_scx(struct rq *rq) { struct scx_sched *sch = scx_root; - struct task_struct *p = rq->curr; + struct task_struct *p = rq->donor; if (SCX_HAS_OP(sch, yield)) SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); @@ -1485,7 +1485,7 @@ static void yield_task_scx(struct rq *rq) static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { struct scx_sched *sch = scx_root; - struct task_struct *from = rq->curr; + struct task_struct *from = rq->donor; if (SCX_HAS_OP(sch, yield)) return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, @@ -2047,7 +2047,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); + rq->scx.flags &= ~SCX_RQ_BAL_KEEP; if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && unlikely(rq->scx.cpu_released)) { @@ -2153,42 +2153,6 @@ has_tasks: return true; } -static int balance_scx(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) -{ - int ret; - - rq_unpin_lock(rq, rf); - - ret = balance_one(rq, prev); - -#ifdef CONFIG_SCHED_SMT - /* - * When core-sched is enabled, this ops.balance() call will be followed - * by pick_task_scx() on this CPU and the SMT siblings. Balance the - * siblings too. - */ - if (sched_core_enabled(rq)) { - const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); - int scpu; - - for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { - struct rq *srq = cpu_rq(scpu); - struct task_struct *sprev = srq->curr; - - WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); - update_rq_clock(srq); - balance_one(srq, sprev); - } - } -#endif - rq_repin_lock(rq, rf); - - maybe_queue_balance_callback(rq); - - return ret; -} - static void process_ddsp_deferred_locals(struct rq *rq) { struct task_struct *p; @@ -2368,41 +2332,23 @@ static struct task_struct *first_local_task(struct rq *rq) struct task_struct, scx.dsq_list.node); } -static struct task_struct *pick_task_scx(struct rq *rq) +static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) { struct task_struct *prev = rq->curr; + bool keep_prev, kick_idle = false; struct task_struct *p; - bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; - bool kick_idle = false; - /* - * WORKAROUND: - * - * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just - * have gone through balance_scx(). Unfortunately, there currently is a - * bug where fair could say yes on balance() but no on pick_task(), - * which then ends up calling pick_task_scx() without preceding - * balance_scx(). - * - * Keep running @prev if possible and avoid stalling from entering idle - * without balancing. - * - * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() - * if pick_task_scx() is called without preceding balance_scx(). - */ - if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev->scx.flags & SCX_TASK_QUEUED) { - keep_prev = true; - } else { - keep_prev = false; - kick_idle = true; - } - } else if (unlikely(keep_prev && - prev->sched_class != &ext_sched_class)) { - /* - * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is - * conditional on scx_enabled() and may have been skipped. - */ + rq_modified_clear(rq); + rq_unpin_lock(rq, rf); + balance_one(rq, prev); + rq_repin_lock(rq, rf); + maybe_queue_balance_callback(rq); + if (rq_modified_above(rq, &ext_sched_class)) + return RETRY_TASK; + + keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); keep_prev = false; } @@ -2940,9 +2886,9 @@ void scx_post_fork(struct task_struct *p) } } - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); list_add_tail(&p->scx.tasks_node, &scx_tasks); - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); percpu_up_read(&scx_fork_rwsem); } @@ -2966,9 +2912,9 @@ void sched_ext_free(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&scx_tasks_lock, flags); + raw_spin_lock_irqsave(&scx_tasks_lock, flags); list_del_init(&p->scx.tasks_node); - spin_unlock_irqrestore(&scx_tasks_lock, flags); + raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); /* * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED @@ -2997,7 +2943,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p, p, p->scx.weight); } -static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) { } @@ -3270,6 +3216,8 @@ static void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. */ DEFINE_SCHED_CLASS(ext) = { + .queue_mask = 1, + .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, @@ -3277,7 +3225,6 @@ DEFINE_SCHED_CLASS(ext) = { .wakeup_preempt = wakeup_preempt_scx, - .balance = balance_scx, .pick_task = pick_task_scx, .put_prev_task = put_prev_task_scx, @@ -3818,11 +3765,10 @@ static void scx_bypass(bool bypass) */ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, scx.runnable_node) { - struct sched_enq_and_set_ctx ctx; - /* cycling deq/enq is enough, see the function comment */ - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* nothing */ ; + } } /* resched to restore ticks and idle state */ @@ -3972,22 +3918,20 @@ static void scx_disable_workfn(struct kthread_work *work) scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *old_class = p->sched_class; const struct sched_class *new_class = __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + update_rq_clock(task_rq(p)); - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); - - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, queue_flags) { + p->sched_class = new_class; + } - check_class_changed(task_rq(p), p, old_class, p->prio); scx_exit_task(p); } scx_task_iter_stop(&sti); @@ -4276,7 +4220,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) size_t avail, used; bool idle; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); idle = list_empty(&rq->scx.runnable_list) && rq->curr->sched_class == &idle_sched_class; @@ -4345,7 +4289,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) scx_dump_task(&s, &dctx, p, ' '); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); } dump_newline(&s); @@ -4479,8 +4423,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) goto err_free_gdsqs; sch->helper = kthread_run_worker(0, "sched_ext_helper"); - if (!sch->helper) + if (IS_ERR(sch->helper)) { + ret = PTR_ERR(sch->helper); goto err_free_pcpu; + } + sched_set_fifo(sch->helper->task); atomic_set(&sch->exit_kind, SCX_EXIT_NONE); @@ -4748,26 +4695,22 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) percpu_down_write(&scx_fork_rwsem); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *old_class = p->sched_class; const struct sched_class *new_class = __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; if (!tryget_task_struct(p)) continue; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - - p->scx.slice = SCX_SLICE_DFL; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, queue_flags) { + p->scx.slice = SCX_SLICE_DFL; + p->sched_class = new_class; + } - check_class_changed(task_rq(p), p, old_class, p->prio); put_task_struct(p); } scx_task_iter_stop(&sti); @@ -5321,8 +5264,8 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); - init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); - init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); + rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); if (cpu_online(cpu)) cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; @@ -6401,7 +6344,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) guard(rcu)(); - sch = rcu_dereference(sch); + sch = rcu_dereference(scx_root); if (unlikely(!sch)) return; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25970dbbb279..769d7b7990df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a, static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return (s64)(se->vruntime - cfs_rq->min_vruntime); + return (s64)(se->vruntime - cfs_rq->zero_vruntime); } #define __node_2_se(node) \ @@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * Which we track using: * - * v0 := cfs_rq->min_vruntime + * v0 := cfs_rq->zero_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime * \Sum w_i := cfs_rq->avg_load * - * Since min_vruntime is a monotonic increasing variable that closely tracks - * the per-task service, these deltas: (v_i - v), will be in the order of the - * maximal (virtual) lag induced in the system due to quantisation. + * Since zero_vruntime closely tracks the per-task service, these + * deltas: (v_i - v), will be in the order of the maximal (virtual) lag + * induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. * @@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) avg = div_s64(avg, load); } - return cfs_rq->min_vruntime + avg; + return cfs_rq->zero_vruntime + avg; } /* @@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; + return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) return vruntime_eligible(cfs_rq, se->vruntime); } -static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +static void update_zero_vruntime(struct cfs_rq *cfs_rq) { - u64 min_vruntime = cfs_rq->min_vruntime; - /* - * open coded max_vruntime() to allow updating avg_vruntime - */ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta > 0) { - avg_vruntime_update(cfs_rq, delta); - min_vruntime = vruntime; - } - return min_vruntime; -} + u64 vruntime = avg_vruntime(cfs_rq); + s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); -static void update_min_vruntime(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = __pick_root_entity(cfs_rq); - struct sched_entity *curr = cfs_rq->curr; - u64 vruntime = cfs_rq->min_vruntime; - - if (curr) { - if (curr->on_rq) - vruntime = curr->vruntime; - else - curr = NULL; - } - - if (se) { - if (!curr) - vruntime = se->min_vruntime; - else - vruntime = min_vruntime(vruntime, se->min_vruntime); - } + avg_vruntime_update(cfs_rq, delta); - /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); + cfs_rq->zero_vruntime = vruntime; } static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) @@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); + update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, @@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); avg_vruntime_sub(cfs_rq, se); + update_zero_vruntime(cfs_rq); } struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) @@ -955,6 +929,16 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se; + /* + * Picking the ->next buddy will affect latency but not fairness. + */ + if (sched_feat(PICK_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { + /* ->next will never be delayed */ + WARN_ON_ONCE(cfs_rq->next->sched_delayed); + return cfs_rq->next; + } + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; @@ -1193,6 +1177,8 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) return delta_exec; } +static void set_next_buddy(struct sched_entity *se); + /* * Used by other classes to account runtime. */ @@ -1226,7 +1212,6 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->vruntime += calc_delta_fair(delta_exec, curr); resched = update_deadline(cfs_rq, curr); - update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { /* @@ -1239,8 +1224,7 @@ static void update_curr(struct cfs_rq *cfs_rq) * against fair_server such that it can account for this time * and possibly avoid running this period. */ - if (dl_server_active(&rq->fair_server)) - dl_server_update(&rq->fair_server, delta_exec); + dl_server_update(&rq->fair_server, delta_exec); } account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -3808,15 +3792,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (!curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; - - /* - * The entity's vruntime has been adjusted, so let's check - * whether the rq-wide min_vruntime needs updated too. Since - * the calculations above require stable min_vruntime rather - * than up-to-date one, we do the update at the end of the - * reweight process. - */ - update_min_vruntime(cfs_rq); } } @@ -5429,15 +5404,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); - /* - * Now advance min_vruntime if @se was the entity holding it back, - * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be - * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- i.e. we'll be penalized. - */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) - update_min_vruntime(cfs_rq); - if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); @@ -5512,16 +5478,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { struct sched_entity *se; - /* - * Picking the ->next buddy will affect latency but not fairness. - */ - if (sched_feat(PICK_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { - /* ->next will never be delayed */ - WARN_ON_ONCE(cfs_rq->next->sched_delayed); - return cfs_rq->next; - } - se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); @@ -6024,20 +5980,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; /* - * It's possible we are called with !runtime_remaining due to things - * like user changed quota setting(see tg_set_cfs_bandwidth()) or async - * unthrottled us with a positive runtime_remaining but other still - * running entities consumed those runtime before we reached here. + * It's possible we are called with runtime_remaining < 0 due to things + * like async unthrottled us with a positive runtime_remaining but other + * still running entities consumed those runtime before we reached here. * - * Anyway, we can't unthrottle this cfs_rq without any runtime remaining - * because any enqueue in tg_unthrottle_up() will immediately trigger a - * throttle, which is not supposed to happen on unthrottle path. + * We can't unthrottle this cfs_rq without any runtime remaining because + * any enqueue in tg_unthrottle_up() will immediately trigger a throttle, + * which is not supposed to happen on unthrottle path. */ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) return; - se = cfs_rq->tg->se[cpu_of(rq)]; - cfs_rq->throttled = 0; update_rq_clock(rq); @@ -7006,12 +6959,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) h_nr_idle = 1; } - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { - /* Account for idle runtime */ - if (!rq->nr_running) - dl_server_update_idle_time(rq, rq->curr); + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); - } /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -7038,8 +6987,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } -static void set_next_buddy(struct sched_entity *se); - /* * Basically dequeue_task_fair(), except it can deal with dequeue_entity() * failing half-way through and resume the dequeue later. @@ -8715,15 +8662,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context set_task_max_allowed_capacity(p); } -static int -balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - if (sched_fair_runnable(rq)) - return 1; - - return sched_balance_newidle(rq, rf) != 0; -} - static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { @@ -8735,16 +8673,81 @@ static void set_next_buddy(struct sched_entity *se) } } +enum preempt_wakeup_action { + PREEMPT_WAKEUP_NONE, /* No preemption. */ + PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */ + PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */ + PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */ +}; + +static inline bool +set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags, + struct sched_entity *pse, struct sched_entity *se) +{ + /* + * Keep existing buddy if the deadline is sooner than pse. + * The older buddy may be cache cold and completely unrelated + * to the current wakeup but that is unpredictable where as + * obeying the deadline is more in line with EEVDF objectives. + */ + if (cfs_rq->next && entity_before(cfs_rq->next, pse)) + return false; + + set_next_buddy(pse); + return true; +} + +/* + * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not + * strictly enforced because the hint is either misunderstood or + * multiple tasks must be woken up. + */ +static inline enum preempt_wakeup_action +preempt_sync(struct rq *rq, int wake_flags, + struct sched_entity *pse, struct sched_entity *se) +{ + u64 threshold, delta; + + /* + * WF_SYNC without WF_TTWU is not expected so warn if it happens even + * though it is likely harmless. + */ + WARN_ON_ONCE(!(wake_flags & WF_TTWU)); + + threshold = sysctl_sched_migration_cost; + delta = rq_clock_task(rq) - se->exec_start; + if ((s64)delta < 0) + delta = 0; + + /* + * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they + * could run on other CPUs. Reduce the threshold before preemption is + * allowed to an arbitrary lower value as it is more likely (but not + * guaranteed) the waker requires the wakee to finish. + */ + if (wake_flags & WF_RQ_SELECTED) + threshold >>= 2; + + /* + * As WF_SYNC is not strictly obeyed, allow some runtime for batch + * wakeups to be issued. + */ + if (entity_before(pse, se) && delta >= threshold) + return PREEMPT_WAKEUP_RESCHED; + + return PREEMPT_WAKEUP_NONE; +} + /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { + enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; struct sched_entity *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; - bool do_preempt_short = false; if (unlikely(se == pse)) return; @@ -8758,10 +8761,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (task_is_throttled(p)) return; - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { - set_next_buddy(pse); - } - /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. @@ -8793,7 +8792,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * When non-idle entity preempt an idle entity, * don't give idle entity slice protection. */ - do_preempt_short = true; + preempt_action = PREEMPT_WAKEUP_SHORT; goto preempt; } @@ -8812,27 +8811,74 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. */ - do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice); + if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) { + preempt_action = PREEMPT_WAKEUP_SHORT; + goto pick; + } /* + * Ignore wakee preemption on WF_FORK as it is less likely that + * there is shared data as exec often follow fork. Do not + * preempt for tasks that are sched_delayed as it would violate + * EEVDF to forcibly queue an ineligible task. + */ + if ((wake_flags & WF_FORK) || pse->sched_delayed) + return; + + /* + * If @p potentially is completing work required by current then + * consider preemption. + * + * Reschedule if waker is no longer eligible. */ + if (in_task() && !entity_eligible(cfs_rq, se)) { + preempt_action = PREEMPT_WAKEUP_RESCHED; + goto preempt; + } + + /* Prefer picking wakee soon if appropriate. */ + if (sched_feat(NEXT_BUDDY) && + set_preempt_buddy(cfs_rq, wake_flags, pse, se)) { + + /* + * Decide whether to obey WF_SYNC hint for a new buddy. Old + * buddies are ignored as they may not be relevant to the + * waker and less likely to be cache hot. + */ + if (wake_flags & WF_SYNC) + preempt_action = preempt_sync(rq, wake_flags, pse, se); + } + + switch (preempt_action) { + case PREEMPT_WAKEUP_NONE: + return; + case PREEMPT_WAKEUP_RESCHED: + goto preempt; + case PREEMPT_WAKEUP_SHORT: + fallthrough; + case PREEMPT_WAKEUP_PICK: + break; + } + +pick: + /* * If @p has become the most eligible task, force preemption. */ - if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) + if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse) goto preempt; - if (sched_feat(RUN_TO_PARITY) && do_preempt_short) + if (sched_feat(RUN_TO_PARITY)) update_protect_slice(cfs_rq, se); return; preempt: - if (do_preempt_short) + if (preempt_action == PREEMPT_WAKEUP_SHORT) cancel_protect_slice(se); resched_curr_lazy(rq); } -static struct task_struct *pick_task_fair(struct rq *rq) +static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) { struct sched_entity *se; struct cfs_rq *cfs_rq; @@ -8876,7 +8922,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - p = pick_task_fair(rq); + p = pick_task_fair(rq, rf); if (!p) goto idle; se = &p->se; @@ -8955,14 +9001,10 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) +static struct task_struct * +fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) { - return pick_next_task_fair(rq, prev, NULL); -} - -static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) -{ - return pick_task_fair(dl_se->rq); + return pick_task_fair(dl_se->rq, rf); } void fair_server_init(struct rq *rq) @@ -8993,7 +9035,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t */ static void yield_task_fair(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *curr = rq->donor; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -9017,7 +9059,18 @@ static void yield_task_fair(struct rq *rq) */ rq_clock_skip_update(rq); - se->deadline += calc_delta_fair(se->slice, se); + /* + * Forfeit the remaining vruntime, only if the entity is eligible. This + * condition is necessary because in core scheduling we prefer to run + * ineligible tasks rather than force idling. If this happens we may + * end up in a loop where the core scheduler picks the yielding task, + * which yields immediately again; without the condition the vruntime + * ends up quickly running away. + */ + if (entity_eligible(cfs_rq, se)) { + se->vruntime = se->deadline; + se->deadline += calc_delta_fair(se->slice, se); + } } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -10681,7 +10734,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, if (sd->flags & SD_ASYM_CPUCAPACITY) sgs->group_misfit_task_load = 1; - for_each_cpu(i, sched_group_span(group)) { + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { struct rq *rq = cpu_rq(i); unsigned int local; @@ -11733,6 +11786,21 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd } /* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); + +/* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ @@ -11757,6 +11825,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), }; + bool need_unlock = false; cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); @@ -11768,6 +11837,14 @@ redo: goto out_balanced; } + if (!need_unlock && (sd->flags & SD_SERIALIZE)) { + int zero = 0; + if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1)) + goto out_balanced; + + need_unlock = true; + } + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); @@ -12008,6 +12085,9 @@ out_one_pinned: sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; out: + if (need_unlock) + atomic_set_release(&sched_balance_running, 0); + return ld_moved; } @@ -12133,21 +12213,6 @@ out_unlock: } /* - * This flag serializes load-balancing passes over large domains - * (above the NODE topology level) - only one load-balancing instance - * may run at a time, to reduce overhead on very large systems with - * lots of CPUs and large NUMA distances. - * - * - Note that load-balancing passes triggered while another one - * is executing are skipped and not re-tried. - * - * - Also note that this does not serialize rebalance_domains() - * execution, as non-SD_SERIALIZE domains will still be - * load-balanced in parallel. - */ -static atomic_t sched_balance_running = ATOMIC_INIT(0); - -/* * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ @@ -12156,30 +12221,43 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) +{ + sd->newidle_call++; + sd->newidle_success += success; + + if (sd->newidle_call >= 1024) { + sd->newidle_ratio = sd->newidle_success; + sd->newidle_call /= 2; + sd->newidle_success /= 2; + } +} + +static inline bool +update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) { + unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; + unsigned long now = jiffies; + + if (cost) + update_newidle_stats(sd, success); + if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. - * - * sched_balance_newidle() bumps the cost whenever newidle - * balance fails, and we don't want things to grow out of - * control. Use the sysctl_sched_migration_cost as the upper - * limit, plus a litle extra to avoid off by ones. */ - sd->max_newidle_lb_cost = - min(cost, sysctl_sched_migration_cost + 200); - sd->last_decay_max_lb_cost = jiffies; - } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + sd->max_newidle_lb_cost = cost; + sd->last_decay_max_lb_cost = now; + + } else if (time_after(now, next_decay)) { /* * Decay the newidle max times by ~1% per second to ensure that * it is not outdated and the current max cost is actually * shorter. */ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; - sd->last_decay_max_lb_cost = jiffies; - + sd->last_decay_max_lb_cost = now; return true; } @@ -12202,7 +12280,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize, need_decay = 0; + int need_decay = 0; u64 max_cost = 0; rcu_read_lock(); @@ -12211,7 +12289,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) * Decay the newidle max times here because this is a regular * visit to all the domains. */ - need_decay = update_newidle_cost(sd, 0); + need_decay = update_newidle_cost(sd, 0, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -12226,13 +12304,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) } interval = get_sd_balance_interval(sd, busy); - - need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) - goto out; - } - if (time_after_eq(jiffies, sd->last_balance + interval)) { if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* @@ -12246,9 +12317,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } - if (need_serialize) - atomic_set_release(&sched_balance_running, 0); -out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; update_next_balance = 1; @@ -12827,18 +12895,21 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (!sd) { + rcu_read_unlock(); + goto out; + } if (!get_rd_overloaded(this_rq->rd) || - (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { + this_rq->avg_idle < sd->max_newidle_lb_cost) { - if (sd) - update_next_balance(sd, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); - goto out; } rcu_read_unlock(); + rq_modified_clear(this_rq); raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); @@ -12854,6 +12925,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) break; if (sd->flags & SD_BALANCE_NEWIDLE) { + unsigned int weight = 1; + + if (sched_feat(NI_RANDOM)) { + /* + * Throw a 1k sided dice; and only run + * newidle_balance according to the success + * rate. + */ + u32 d1k = sched_rng() % 1024; + weight = 1 + sd->newidle_ratio; + if (d1k > weight) { + update_newidle_stats(sd, 0); + continue; + } + weight = (1024 + weight/2) / weight; + } pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, @@ -12865,13 +12952,10 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t0 = t1; /* - * Failing newidle means it is not effective; - * bump the cost so we end up doing less of it. + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. */ - if (!pulled_task) - domain_cost = (3 * sd->max_newidle_lb_cost) / 2; - - update_newidle_cost(sd, domain_cost); + update_newidle_cost(sd, domain_cost, weight * !!pulled_task); } /* @@ -12896,8 +12980,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_queued) + /* If a higher prio class was modified, restart the pick */ + if (rq_modified_above(this_rq, &fair_sched_class)) pulled_task = -1; out: @@ -13015,7 +13099,170 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) } /* - * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + * Consider any infeasible weight scenario. Take for instance two tasks, + * each bound to their respective sibling, one with weight 1 and one with + * weight 2. Then the lower weight task will run ahead of the higher weight + * task without bound. + * + * This utterly destroys the concept of a shared time base. + * + * Remember; all this is about a proportionally fair scheduling, where each + * tasks receives: + * + * w_i + * dt_i = ---------- dt (1) + * \Sum_j w_j + * + * which we do by tracking a virtual time, s_i: + * + * 1 + * s_i = --- d[t]_i (2) + * w_i + * + * Where d[t] is a delta of discrete time, while dt is an infinitesimal. + * The immediate corollary is that the ideal schedule S, where (2) to use + * an infinitesimal delta, is: + * + * 1 + * S = ---------- dt (3) + * \Sum_i w_i + * + * From which we can define the lag, or deviation from the ideal, as: + * + * lag(i) = S - s_i (4) + * + * And since the one and only purpose is to approximate S, we get that: + * + * \Sum_i w_i lag(i) := 0 (5) + * + * If this were not so, we no longer converge to S, and we can no longer + * claim our scheduler has any of the properties we derive from S. This is + * exactly what you did above, you broke it! + * + * + * Let's continue for a while though; to see if there is anything useful to + * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i: + * + * \Sum_i w_i s_i + * S = -------------- (6) + * \Sum_i w_i + * + * Which gives us a way to compute S, given our s_i. Now, if you've read + * our code, you know that we do not in fact do this, the reason for this + * is two-fold. Firstly, computing S in that way requires a 64bit division + * for every time we'd use it (see 12), and secondly, this only describes + * the steady-state, it doesn't handle dynamics. + * + * Anyway, in (6): s_i -> x + (s_i - x), to get: + * + * \Sum_i w_i (s_i - x) + * S - x = -------------------- (7) + * \Sum_i w_i + * + * Which shows that S and s_i transform alike (which makes perfect sense + * given that S is basically the (weighted) average of s_i). + * + * So the thing to remember is that the above is strictly UP. It is + * possible to generalize to multiple runqueues -- however it gets really + * yuck when you have to add affinity support, as illustrated by our very + * first counter-example. + * + * Luckily I think we can avoid needing a full multi-queue variant for + * core-scheduling (or load-balancing). The crucial observation is that we + * only actually need this comparison in the presence of forced-idle; only + * then do we need to tell if the stalled rq has higher priority over the + * other. + * + * [XXX assumes SMT2; better consider the more general case, I suspect + * it'll work out because our comparison is always between 2 rqs and the + * answer is only interesting if one of them is forced-idle] + * + * And (under assumption of SMT2) when there is forced-idle, there is only + * a single queue, so everything works like normal. + * + * Let, for our runqueue 'k': + * + * T_k = \Sum_i w_i s_i + * W_k = \Sum_i w_i ; for all i of k (8) + * + * Then we can write (6) like: + * + * T_k + * S_k = --- (9) + * W_k + * + * From which immediately follows that: + * + * T_k + T_l + * S_k+l = --------- (10) + * W_k + W_l + * + * On which we can define a combined lag: + * + * lag_k+l(i) := S_k+l - s_i (11) + * + * And that gives us the tools to compare tasks across a combined runqueue. + * + * + * Combined this gives the following: + * + * a) when a runqueue enters force-idle, sync it against it's sibling rq(s) + * using (7); this only requires storing single 'time'-stamps. + * + * b) when comparing tasks between 2 runqueues of which one is forced-idle, + * compare the combined lag, per (11). + * + * Now, of course cgroups (I so hate them) make this more interesting in + * that a) seems to suggest we need to iterate all cgroup on a CPU at such + * boundaries, but I think we can avoid that. The force-idle is for the + * whole CPU, all it's rqs. So we can mark it in the root and lazily + * propagate downward on demand. + */ + +/* + * So this sync is basically a relative reset of S to 0. + * + * So with 2 queues, when one goes idle, we drop them both to 0 and one + * then increases due to not being idle, and the idle one builds up lag to + * get re-elected. So far so simple, right? + * + * When there's 3, we can have the situation where 2 run and one is idle, + * we sync to 0 and let the idle one build up lag to get re-election. Now + * suppose another one also drops idle. At this point dropping all to 0 + * again would destroy the built-up lag from the queue that was already + * idle, not good. + * + * So instead of syncing everything, we can: + * + * less := !((s64)(s_a - s_b) <= 0) + * + * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b + * == v_a - (v_b - S_a + S_b) + * + * IOW, we can recast the (lag) comparison to a one-sided difference. + * So if then, instead of syncing the whole queue, sync the idle queue + * against the active queue with S_a + S_b at the point where we sync. + * + * (XXX consider the implication of living in a cyclic group: N / 2^n N) + * + * This gives us means of syncing single queues against the active queue, + * and for already idle queues to preserve their build-up lag. + * + * Of course, then we get the situation where there's 2 active and one + * going idle, who do we pick to sync against? Theory would have us sync + * against the combined S, but as we've already demonstrated, there is no + * such thing in infeasible weight scenarios. + * + * One thing I've considered; and this is where that core_active rudiment + * came from, is having active queues sync up between themselves after + * every tick. This limits the observed divergence due to the work + * conservancy. + * + * On top of that, we can improve upon things by employing (10) here. + */ + +/* + * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed. */ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, bool forceidle) @@ -13029,7 +13276,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, cfs_rq->forceidle_seq = fi_seq; } - cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime; } } @@ -13082,11 +13329,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, /* * Find delta after normalizing se's vruntime with its cfs_rq's - * min_vruntime_fi, which would have been updated in prior calls + * zero_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). */ delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); return delta > 0; } @@ -13148,11 +13395,14 @@ static void task_fork_fair(struct task_struct *p) * the current task. */ static void -prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (rq->cfs.nr_queued == 1) return; @@ -13164,8 +13414,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); - } else + } else { wakeup_preempt(rq, p, 0); + } } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13249,6 +13500,12 @@ static void attach_task_cfs_rq(struct task_struct *p) attach_entity_cfs_rq(se); } +static void switching_from_fair(struct rq *rq, struct task_struct *p) +{ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); +} + static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); @@ -13322,7 +13579,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + cfs_rq->zero_vruntime = (u64)(-(1LL << 20)); raw_spin_lock_init(&cfs_rq->removed.lock); } @@ -13623,6 +13880,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task */ DEFINE_SCHED_CLASS(fair) = { + .queue_mask = 2, + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -13631,11 +13890,10 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = check_preempt_wakeup_fair, .pick_task = pick_task_fair, - .pick_next_task = __pick_next_task_fair, + .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, - .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13650,6 +13908,7 @@ DEFINE_SCHED_CLASS(fair) = { .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, + .switching_from = switching_from_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 3c12d9f93331..980d92bab8ab 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, false) +SCHED_FEAT(NEXT_BUDDY, true) /* * Allow completely ignoring cfs_rq->next; which can be set from various @@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) + +/* + * Do newidle balancing proportional to its success rate using randomization. + */ +SCHED_FEAT(NI_RANDOM, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c39b089d4f09..1cb7a3d70e65 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -452,9 +452,11 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) resched_curr(rq); } +static void update_curr_idle(struct rq *rq); + static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - dl_server_update_idle_time(rq, prev); + update_curr_idle(rq); scx_update_idle(rq, false, true); } @@ -466,7 +468,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir next->se.exec_start = rq_clock_task(rq); } -struct task_struct *pick_task_idle(struct rq *rq) +struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) { scx_update_idle(rq, true, false); return rq->idle; @@ -496,21 +498,36 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) */ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { + update_curr_idle(rq); } -static void switched_to_idle(struct rq *rq, struct task_struct *p) +static void switching_to_idle(struct rq *rq, struct task_struct *p) { BUG(); } static void -prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); } static void update_curr_idle(struct rq *rq) { + struct sched_entity *se = &rq->idle->se; + u64 now = rq_clock_task(rq); + s64 delta_exec; + + delta_exec = now - se->exec_start; + if (unlikely(delta_exec <= 0)) + return; + + se->exec_start = now; + + dl_server_update_idle(&rq->fair_server, delta_exec); } /* @@ -518,6 +535,8 @@ static void update_curr_idle(struct rq *rq) */ DEFINE_SCHED_CLASS(idle) = { + .queue_mask = 0, + /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ @@ -536,6 +555,6 @@ DEFINE_SCHED_CLASS(idle) = { .task_tick = task_tick_idle, .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, + .switching_to = switching_to_idle, .update_curr = update_curr_idle, }; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index a4cf17b1fab0..3ad0d6df6a0a 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -167,6 +167,29 @@ static int __init housekeeping_setup(char *str, unsigned long flags) } } + /* + * Check the combination of nohz_full and isolcpus=domain, + * necessary to avoid problems with the timer migration + * hierarchy. managed_irq is ignored by this check since it + * isn't considered in the timer migration logic. + */ + iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN); + type = find_first_bit(&iter_flags, HK_TYPE_MAX); + /* + * Pass the check if none of these flags were previously set or + * are not in the current selection. + */ + iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN); + first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 : + cpumask_first_and_and(cpu_present_mask, + housekeeping_staging, housekeeping.cpumasks[type]); + if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) { + pr_warn("Housekeeping: must include one present CPU " + "neither in nohz_full= nor in isolcpus=domain, " + "ignoring setting %s\n", str); + goto free_housekeeping_staging; + } + iter_flags = flags & ~housekeeping.flags; for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 62fba83b7bb1..623445603725 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -199,7 +199,7 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_preempt(current); + rseq_sched_switch_event(current); } static void ipi_sync_rq_state(void *info) @@ -407,9 +407,9 @@ static int membarrier_private_expedited(int flags, int cpu_id) * membarrier, we will end up with some thread in the mm * running without a core sync. * - * For RSEQ, don't rseq_preempt() the caller. User code - * is not supposed to issue syscalls at all from inside an - * rseq critical section. + * For RSEQ, don't invoke rseq_sched_switch_event() on the + * caller. User code is not supposed to issue syscalls at + * all from inside an rseq critical section. */ if (flags != MEMBARRIER_FLAG_SYNC_CORE) { preempt_disable(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7936d4333731..f1867fe8e5c5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1490,7 +1490,7 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) static void yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, rq->curr, 0); + requeue_task_rt(rq, rq->donor, 0); } static int find_lowest_rq(struct task_struct *task); @@ -1695,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct *pick_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf) { struct task_struct *p; @@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * us to initiate a push or pull. */ static void -prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (task_current_donor(rq, p)) { /* * If our priority decreases while running, we @@ -2566,6 +2569,8 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(rt) = { + .queue_mask = 4, + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, @@ -2589,8 +2594,8 @@ DEFINE_SCHED_CLASS(rt) = { .get_rr_interval = get_rr_interval_rt, - .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + .prio_changed = prio_changed_rt, .update_curr = update_curr_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index adfb6e3409d7..8590113e4a60 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #ifndef _KERNEL_SCHED_SCHED_H #define _KERNEL_SCHED_SCHED_H +#include <linux/prandom.h> #include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> #include <linux/sched/cpufreq.h> @@ -20,7 +21,6 @@ #include <linux/sched/task_flags.h> #include <linux/sched/task.h> #include <linux/sched/topology.h> - #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/bug.h> @@ -405,6 +405,7 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * naturally thottled to once per period, avoiding high context switch * workloads from spamming the hrtimer program/cancel paths. */ +extern void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); @@ -412,8 +413,6 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_pick_f pick_task); extern void sched_init_dl_servers(void); -extern void dl_server_update_idle_time(struct rq *rq, - struct task_struct *p); extern void fair_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, @@ -682,10 +681,10 @@ struct cfs_rq { s64 avg_vruntime; u64 avg_load; - u64 min_vruntime; + u64 zero_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; - u64 min_vruntime_fi; + u64 zero_vruntime_fi; #endif struct rb_root_cached tasks_timeline; @@ -780,7 +779,6 @@ enum scx_rq_flags { */ SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, - SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ @@ -1120,6 +1118,8 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; + /* Per class runqueue modification mask; bits in class order. */ + unsigned int queue_mask; unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1349,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p) } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); + +static inline u32 sched_rng(void) +{ + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); +} #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() this_cpu_ptr(&runqueues) @@ -1432,6 +1438,9 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) if (!sched_core_enabled(rq)) return true; + if (rq->core->core_cookie == p->core_cookie) + return true; + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { if (!available_idle_cpu(cpu)) { idle_core = false; @@ -1443,7 +1452,7 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) * A CPU in an idle core is always the best choice for tasks with * cookies. */ - return idle_core || rq->core->core_cookie == p->core_cookie; + return idle_core; } static inline bool sched_group_cookie_match(struct rq *rq, @@ -1827,7 +1836,8 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void +__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); @@ -1839,8 +1849,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - rq_unpin_lock(rq, rf); - raw_spin_rq_unlock(rq); + __task_rq_unlock(rq, p, rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } @@ -1849,6 +1858,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct, + _T->rq = __task_rq_lock(_T->lock, &_T->rf), + __task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) + static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { @@ -2209,6 +2223,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; + rseq_sched_set_ids_changed(p); #endif /* CONFIG_SMP */ } @@ -2342,8 +2357,7 @@ extern const u32 sched_prio_to_wmult[40]; /* * {de,en}queue flags: * - * DEQUEUE_SLEEP - task is no longer runnable - * ENQUEUE_WAKEUP - task just became runnable + * SLEEP/WAKEUP - task is no-longer/just-became runnable * * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks * are in a known state which allows modification. Such pairs @@ -2356,34 +2370,46 @@ extern const u32 sched_prio_to_wmult[40]; * * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) * + * DELAYED - de/re-queue a sched_delayed task + * + * CLASS - going to update p->sched_class; makes sched_change call the + * various switch methods. + * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * + * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but + * SCHED_DEADLINE seems to rely on this for now. */ -#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ -#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ -#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ -#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ -#define DEQUEUE_SPECIAL 0x10 -#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ -#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ -#define DEQUEUE_THROTTLE 0x800 - -#define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_RESTORE 0x02 -#define ENQUEUE_MOVE 0x04 -#define ENQUEUE_NOCLOCK 0x08 - -#define ENQUEUE_HEAD 0x10 -#define ENQUEUE_REPLENISH 0x20 -#define ENQUEUE_MIGRATED 0x40 -#define ENQUEUE_INITIAL 0x80 -#define ENQUEUE_MIGRATING 0x100 -#define ENQUEUE_DELAYED 0x200 -#define ENQUEUE_RQ_SELECTED 0x400 +#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */ +#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */ + +#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */ +#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */ + +#define DEQUEUE_SPECIAL 0x00010000 +#define DEQUEUE_THROTTLE 0x00020000 + +#define ENQUEUE_WAKEUP 0x0001 +#define ENQUEUE_RESTORE 0x0002 +#define ENQUEUE_MOVE 0x0004 +#define ENQUEUE_NOCLOCK 0x0008 + +#define ENQUEUE_MIGRATING 0x0010 +#define ENQUEUE_DELAYED 0x0020 +#define ENQUEUE_CLASS 0x0040 + +#define ENQUEUE_HEAD 0x00010000 +#define ENQUEUE_REPLENISH 0x00020000 +#define ENQUEUE_MIGRATED 0x00040000 +#define ENQUEUE_INITIAL 0x00080000 +#define ENQUEUE_RQ_SELECTED 0x00100000 #define RETRY_TASK ((void *)-1UL) @@ -2400,16 +2426,61 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif + /* + * idle: 0 + * ext: 1 + * fair: 2 + * rt: 4 + * dl: 8 + * stop: 16 + */ + unsigned int queue_mask; + /* + * move_queued_task/activate_task/enqueue_task: rq->lock + * ttwu_do_activate/activate_task/enqueue_task: rq->lock + * wake_up_new_task/activate_task/enqueue_task: task_rq_lock + * ttwu_runnable/enqueue_task: task_rq_lock + * proxy_task_current: rq->lock + * sched_change_end + */ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + /* + * move_queued_task/deactivate_task/dequeue_task: rq->lock + * __schedule/block_task/dequeue_task: rq->lock + * proxy_task_current: rq->lock + * wait_task_inactive: task_rq_lock + * sched_change_begin + */ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + + /* + * do_sched_yield: rq->lock + */ void (*yield_task) (struct rq *rq); + /* + * yield_to: rq->lock (double) + */ bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + /* + * move_queued_task: rq->lock + * __migrate_swap_task: rq->lock + * ttwu_do_activate: rq->lock + * ttwu_runnable: task_rq_lock + * wake_up_new_task: task_rq_lock + */ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + /* + * schedule/pick_next_task/prev_balance: rq->lock + */ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - struct task_struct *(*pick_task)(struct rq *rq); + + /* + * schedule/pick_next_task: rq->lock + */ + struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); /* * Optional! When implemented pick_next_task() should be equivalent to: * @@ -2419,55 +2490,123 @@ struct sched_class { * set_next_task_first(next); * } */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); + /* + * sched_change: + * __schedule: rq->lock + */ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + /* + * select_task_rq: p->pi_lock + * sched_exec: p->pi_lock + */ int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + /* + * set_task_cpu: p->pi_lock || rq->lock (ttwu like) + */ void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + /* + * ttwu_do_activate: rq->lock + * wake_up_new_task: task_rq_lock + */ void (*task_woken)(struct rq *this_rq, struct task_struct *task); + /* + * do_set_cpus_allowed: task_rq_lock + sched_change + */ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); + /* + * sched_set_rq_{on,off}line: rq->lock + */ void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + /* + * push_cpu_stop: p->pi_lock && rq->lock + */ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); + /* + * hrtick: rq->lock + * sched_tick: rq->lock + * sched_tick_remote: rq->lock + */ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + /* + * sched_cgroup_fork: p->pi_lock + */ void (*task_fork)(struct task_struct *p); + /* + * finish_task_switch: no locks + */ void (*task_dead)(struct task_struct *p); /* - * The switched_from() call is allowed to drop rq->lock, therefore we - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. + * sched_change + */ + void (*switching_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switching_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + u64 (*get_prio) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + u64 oldprio); + + /* + * set_load_weight: task_rq_lock + sched_change + * __setscheduler_parms: task_rq_lock + sched_change */ - void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*reweight_task)(struct rq *this_rq, struct task_struct *task, const struct load_weight *lw); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); + /* + * sched_rr_get_interval: task_rq_lock + */ unsigned int (*get_rr_interval)(struct rq *rq, struct task_struct *task); + /* + * task_sched_runtime: task_rq_lock + */ void (*update_curr)(struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED + /* + * sched_change_group: task_rq_lock + sched_change + */ void (*task_change_group)(struct task_struct *p); #endif #ifdef CONFIG_SCHED_CORE + /* + * pick_next_task: rq->lock + * try_steal_cookie: rq->lock (double) + */ int (*task_is_throttled)(struct task_struct *p, int cpu); #endif }; +/* + * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. + */ +static inline void rq_modified_clear(struct rq *rq) +{ + rq->queue_mask = 0; +} + +static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) +{ + unsigned int mask = class->queue_mask; + return rq->queue_mask & ~((mask << 1) - 1); +} + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -2579,8 +2718,9 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_queued > 0; } -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_task_idle(struct rq *rq); +extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); +extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 @@ -2610,7 +2750,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) static inline cpumask_t *alloc_user_cpus_ptr(int node) { /* - * See do_set_cpus_allowed() above for the rcu_head usage. + * See set_cpus_allowed_force() above for the rcu_head usage. */ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); @@ -3540,283 +3680,212 @@ extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID -#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ -#define MM_CID_SCAN_DELAY 100 /* 100ms */ +static __always_inline bool cid_on_cpu(unsigned int cid) +{ + return cid & MM_CID_ONCPU; +} -extern raw_spinlock_t cid_lock; -extern int use_cid_lock; +static __always_inline bool cid_in_transit(unsigned int cid) +{ + return cid & MM_CID_TRANSIT; +} -extern void sched_mm_cid_migrate_from(struct task_struct *t); -extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); -extern void init_sched_mm_cid(struct task_struct *t); +static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid) +{ + return cid & ~MM_CID_ONCPU; +} -static inline void __mm_cid_put(struct mm_struct *mm, int cid) +static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid) { - if (cid < 0) - return; - cpumask_clear_cpu(cid, mm_cidmask(mm)); + return cid | MM_CID_ONCPU; } -/* - * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to - * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to - * be held to transition to other states. - * - * State transitions synchronized with cmpxchg or try_cmpxchg need to be - * consistent across CPUs, which prevents use of this_cpu_cmpxchg. - */ -static inline void mm_cid_put_lazy(struct task_struct *t) +static __always_inline unsigned int cid_to_transit_cid(unsigned int cid) { - struct mm_struct *mm = t->mm; - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid; + return cid | MM_CID_TRANSIT; +} - lockdep_assert_irqs_disabled(); - cid = __this_cpu_read(pcpu_cid->cid); - if (!mm_cid_is_lazy_put(cid) || - !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) - return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); +static __always_inline unsigned int cid_from_transit_cid(unsigned int cid) +{ + return cid & ~MM_CID_TRANSIT; } -static inline int mm_cid_pcpu_unset(struct mm_struct *mm) +static __always_inline bool cid_on_task(unsigned int cid) { - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid, res; + /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */ + return cid < MM_CID_TRANSIT; +} - lockdep_assert_irqs_disabled(); - cid = __this_cpu_read(pcpu_cid->cid); - for (;;) { - if (mm_cid_is_unset(cid)) - return MM_CID_UNSET; - /* - * Attempt transition from valid or lazy-put to unset. - */ - res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); - if (res == cid) - break; - cid = res; - } - return cid; +static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid) +{ + clear_bit(cid, mm_cidmask(mm)); } -static inline void mm_cid_put(struct mm_struct *mm) +static __always_inline void mm_unset_cid_on_task(struct task_struct *t) { - int cid; + unsigned int cid = t->mm_cid.cid; - lockdep_assert_irqs_disabled(); - cid = mm_cid_pcpu_unset(mm); - if (cid == MM_CID_UNSET) - return; - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + t->mm_cid.cid = MM_CID_UNSET; + if (cid_on_task(cid)) + mm_drop_cid(t->mm, cid); } -static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) +static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp) { - struct cpumask *cidmask = mm_cidmask(mm); - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid, max_nr_cid, allowed_max_nr_cid; + /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ + pcp->cid = cpu_cid_to_cid(pcp->cid); + mm_drop_cid(mm, pcp->cid); +} - /* - * After shrinking the number of threads or reducing the number - * of allowed cpus, reduce the value of max_nr_cid so expansion - * of cid allocation will preserve cache locality if the number - * of threads or allowed cpus increase again. - */ - max_nr_cid = atomic_read(&mm->max_nr_cid); - while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), - atomic_read(&mm->mm_users))), - max_nr_cid > allowed_max_nr_cid) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ - if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { - max_nr_cid = allowed_max_nr_cid; - break; - } - } - /* Try to re-use recent cid. This improves cache locality. */ - cid = __this_cpu_read(pcpu_cid->recent_cid); - if (!mm_cid_is_unset(cid) && cid < max_nr_cid && - !cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - /* - * Expand cid allocation if the maximum number of concurrency - * IDs allocated (max_nr_cid) is below the number cpus allowed - * and number of threads. Expanding cid allocation as much as - * possible improves cache locality. - */ - cid = max_nr_cid; - while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ - if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) - continue; - if (!cpumask_test_and_set_cpu(cid, cidmask)) - return cid; - } - /* - * Find the first available concurrency id. - * Retry finding first zero bit if the mask is temporarily - * filled. This only happens during concurrent remote-clear - * which owns a cid without holding a rq lock. - */ - for (;;) { - cid = cpumask_first_zero(cidmask); - if (cid < READ_ONCE(mm->nr_cpus_allowed)) - break; - cpu_relax(); - } - if (cpumask_test_and_set_cpu(cid, cidmask)) - return -1; +static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids) +{ + unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids); + if (cid >= max_cids) + return MM_CID_UNSET; + if (test_and_set_bit(cid, mm_cidmask(mm))) + return MM_CID_UNSET; return cid; } -/* - * Save a snapshot of the current runqueue time of this cpu - * with the per-cpu cid value, allowing to estimate how recently it was used. - */ -static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) +static inline unsigned int mm_get_cid(struct mm_struct *mm) { - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); + unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids)); - lockdep_assert_rq_held(rq); - WRITE_ONCE(pcpu_cid->time, rq->clock); + while (cid == MM_CID_UNSET) { + cpu_relax(); + cid = __mm_get_cid(mm, num_possible_cpus()); + } + return cid; } -static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) +static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid, + unsigned int max_cids) { - int cid; + unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid); - /* - * All allocations (even those using the cid_lock) are lock-free. If - * use_cid_lock is set, hold the cid_lock to perform cid allocation to - * guarantee forward progress. - */ - if (!READ_ONCE(use_cid_lock)) { - cid = __mm_cid_try_get(t, mm); - if (cid >= 0) - goto end; - raw_spin_lock(&cid_lock); - } else { - raw_spin_lock(&cid_lock); - cid = __mm_cid_try_get(t, mm); - if (cid >= 0) - goto unlock; + /* Is it in the optimal CID space? */ + if (likely(cid < max_cids)) + return orig_cid; + + /* Try to find one in the optimal space. Otherwise keep the provided. */ + new_cid = __mm_get_cid(mm, max_cids); + if (new_cid != MM_CID_UNSET) { + mm_drop_cid(mm, cid); + /* Preserve the ONCPU mode of the original CID */ + return new_cid | (orig_cid & MM_CID_ONCPU); } + return orig_cid; +} - /* - * cid concurrently allocated. Retry while forcing following - * allocations to use the cid_lock to ensure forward progress. - */ - WRITE_ONCE(use_cid_lock, 1); - /* - * Set use_cid_lock before allocation. Only care about program order - * because this is only required for forward progress. - */ - barrier(); - /* - * Retry until it succeeds. It is guaranteed to eventually succeed once - * all newcoming allocations observe the use_cid_lock flag set. - */ - do { - cid = __mm_cid_try_get(t, mm); - cpu_relax(); - } while (cid < 0); - /* - * Allocate before clearing use_cid_lock. Only care about - * program order because this is for forward progress. - */ - barrier(); - WRITE_ONCE(use_cid_lock, 0); -unlock: - raw_spin_unlock(&cid_lock); -end: - mm_cid_snapshot_time(rq, mm); +static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid) +{ + if (t->mm_cid.cid != cid) { + t->mm_cid.cid = cid; + rseq_sched_set_ids_changed(t); + } +} - return cid; +static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid) +{ + __this_cpu_write(mm->mm_cid.pcpu->cid, cid); } -static inline int mm_cid_get(struct rq *rq, struct task_struct *t, - struct mm_struct *mm) +static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid) { - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid; + unsigned int max_cids, tcid = t->mm_cid.cid; + struct mm_struct *mm = t->mm; - lockdep_assert_rq_held(rq); - cid = __this_cpu_read(pcpu_cid->cid); - if (mm_cid_is_valid(cid)) { - mm_cid_snapshot_time(rq, mm); - return cid; - } - if (mm_cid_is_lazy_put(cid)) { - if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); + max_cids = READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case where both have the ONCPU bit set */ + if (likely(cid_on_cpu(cpu_cid & tcid))) { + if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) { + mm_cid_update_task_cid(t, cpu_cid); + return; + } + /* Try to converge into the optimal CID space */ + cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids); + } else { + /* Hand over or drop the task owned CID */ + if (cid_on_task(tcid)) { + if (cid_on_cpu(cpu_cid)) + mm_unset_cid_on_task(t); + else + cpu_cid = cid_to_cpu_cid(tcid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_cpu(cpu_cid)) + cpu_cid = cid_to_cpu_cid(mm_get_cid(mm)); } - cid = __mm_cid_get(rq, t, mm); - __this_cpu_write(pcpu_cid->cid, cid); - __this_cpu_write(pcpu_cid->recent_cid, cid); - - return cid; + mm_cid_update_pcpu_cid(mm, cpu_cid); + mm_cid_update_task_cid(t, cpu_cid); } -static inline void switch_mm_cid(struct rq *rq, - struct task_struct *prev, - struct task_struct *next) +static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid) { - /* - * Provide a memory barrier between rq->curr store and load of - * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. - * - * Should be adapted if context_switch() is modified. - */ - if (!next->mm) { // to kernel - /* - * user -> kernel transition does not guarantee a barrier, but - * we can use the fact that it performs an atomic operation in - * mmgrab(). - */ - if (prev->mm) // from user - smp_mb__after_mmgrab(); - /* - * kernel -> kernel transition does not change rq->curr->mm - * state. It stays NULL. - */ - } else { // to user - /* - * kernel -> user transition does not provide a barrier - * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. - * Provide it here. - */ - if (!prev->mm) { // from kernel - smp_mb(); - } else { // from user - /* - * user->user transition relies on an implicit - * memory barrier in switch_mm() when - * current->mm changes. If the architecture - * switch_mm() does not have an implicit memory - * barrier, it is emitted here. If current->mm - * is unchanged, no barrier is needed. - */ - smp_mb__after_switch_mm(); + unsigned int max_cids, tcid = t->mm_cid.cid; + struct mm_struct *mm = t->mm; + + max_cids = READ_ONCE(mm->mm_cid.max_cids); + /* Optimize for the common case, where both have the ONCPU bit clear */ + if (likely(cid_on_task(tcid | cpu_cid))) { + if (likely(tcid < max_cids)) { + mm_cid_update_pcpu_cid(mm, tcid); + return; } + /* Try to converge into the optimal CID space */ + tcid = mm_cid_converge(mm, tcid, max_cids); + } else { + /* Hand over or drop the CPU owned CID */ + if (cid_on_cpu(cpu_cid)) { + if (cid_on_task(tcid)) + mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); + else + tcid = cpu_cid_to_cid(cpu_cid); + } + /* Still nothing, allocate a new one */ + if (!cid_on_task(tcid)) + tcid = mm_get_cid(mm); + /* Set the transition mode flag if required */ + tcid |= READ_ONCE(mm->mm_cid.transit); } - if (prev->mm_cid_active) { - mm_cid_snapshot_time(rq, prev->mm); - mm_cid_put_lazy(prev); - prev->mm_cid = -1; - } - if (next->mm_cid_active) - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); + mm_cid_update_pcpu_cid(mm, tcid); + mm_cid_update_task_cid(t, tcid); +} + +static __always_inline void mm_cid_schedin(struct task_struct *next) +{ + struct mm_struct *mm = next->mm; + unsigned int cpu_cid; + + if (!next->mm_cid.active) + return; + + cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid); + if (likely(!READ_ONCE(mm->mm_cid.percpu))) + mm_cid_from_task(next, cpu_cid); + else + mm_cid_from_cpu(next, cpu_cid); +} + +static __always_inline void mm_cid_schedout(struct task_struct *prev) +{ + /* During mode transitions CIDs are temporary and need to be dropped */ + if (likely(!cid_in_transit(prev->mm_cid.cid))) + return; + + mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid)); + prev->mm_cid.cid = MM_CID_UNSET; +} + +static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) +{ + mm_cid_schedout(prev); + mm_cid_schedin(next); } #else /* !CONFIG_SCHED_MM_CID: */ -static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } -static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } -static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } -static inline void init_sched_mm_cid(struct task_struct *t) { } +static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ extern u64 avg_vruntime(struct cfs_rq *cfs_rq); @@ -3875,32 +3944,42 @@ extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class); -extern void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio); - extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#ifdef CONFIG_SCHED_CLASS_EXT /* - * Used by SCX in the enable/disable paths to move tasks between sched_classes - * and establish invariants. + * The 'sched_change' pattern is the safe, easy and slow way of changing a + * task's scheduling properties. It dequeues a task, such that the scheduler + * is fully unaware of it; at which point its properties can be modified; + * after which it is enqueued again. + * + * Typically this must be called while holding task_rq_lock, since most/all + * properties are serialized under those locks. There is currently one + * exception to this rule in sched/ext which only holds rq->lock. */ -struct sched_enq_and_set_ctx { + +/* + * This structure is a temporary, used to preserve/convey the queueing state + * of the task between sched_change_begin() and sched_change_end(). Ensuring + * the task's queueing state is idempotent across the operation. + */ +struct sched_change_ctx { + u64 prio; struct task_struct *p; - int queue_flags; + int flags; bool queued; bool running; }; -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx); -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags); +void sched_change_end(struct sched_change_ctx *ctx); -#endif /* CONFIG_SCHED_CLASS_EXT */ +DEFINE_CLASS(sched_change, struct sched_change_ctx *, + sched_change_end(_T), + sched_change_begin(p, flags), + struct task_struct *p, unsigned int flags) + +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change) #include "ext.h" diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 26f3fd4d34ce..cbf7206b3f9d 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) rq = __task_rq_lock(p, &rf); psi_task_change(p, p->psi_flags, 0); - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); } } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 2d4e279f05ee..4f9192be4b5b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -32,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct *pick_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf) { if (!sched_stop_runnable(rq)) return NULL; @@ -75,14 +75,17 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p) +static void switching_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } static void -prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); /* how!?, what priority? */ } @@ -95,6 +98,8 @@ static void update_curr_stop(struct rq *rq) */ DEFINE_SCHED_CLASS(stop) = { + .queue_mask = 16, + .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, @@ -112,6 +117,6 @@ DEFINE_SCHED_CLASS(stop) = { .task_tick = task_tick_stop, .prio_changed = prio_changed_stop, - .switched_to = switched_to_stop, + .switching_to = switching_to_stop, .update_curr = update_curr_stop, }; diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 77ae87f36e84..0496dc29ed0f 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -64,8 +64,6 @@ static int effective_prio(struct task_struct *p) void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; - struct rq *rq; int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -74,10 +72,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - CLASS(task_rq_lock, rq_guard)(p); - rq = rq_guard.rq; - - update_rq_clock(rq); + guard(task_rq_lock)(p); /* * The RT priorities are set via sched_setscheduler(), but we still @@ -90,28 +85,12 @@ void set_user_nice(struct task_struct *p, long nice) return; } - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - p->sched_class->prio_changed(rq, p, old_prio); + scoped_guard (sched_change, p, DEQUEUE_SAVE) { + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + } } EXPORT_SYMBOL(set_user_nice); @@ -515,7 +494,7 @@ int __sched_setscheduler(struct task_struct *p, bool user, bool pi) { int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; + int retval, oldprio, newprio; const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; @@ -695,38 +674,27 @@ change: prev_class = p->sched_class; next_class = __setscheduler_class(policy, newprio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); + if (prev_class != next_class) + queue_flags |= DEQUEUE_CLASS; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - p->sched_class = next_class; - p->prio = newprio; - } - __setscheduler_uclamp(p, attr); - check_class_changing(rq, p, prev_class); + scoped_guard (sched_change, p, queue_flags) { - if (queued) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - if (oldprio < p->prio) - queue_flags |= ENQUEUE_HEAD; + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + p->sched_class = next_class; + p->prio = newprio; + } + __setscheduler_uclamp(p, attr); - enqueue_task(rq, p, queue_flags); + if (scope->queued) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + if (oldprio < p->prio) + scope->flags |= ENQUEUE_HEAD; + } } - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); /* Avoid rq from going away on us: */ preempt_disable(); @@ -856,6 +824,19 @@ void sched_set_fifo_low(struct task_struct *p) } EXPORT_SYMBOL_GPL(sched_set_fifo_low); +/* + * Used when the primary interrupt handler is forced into a thread, in addition + * to the (always threaded) secondary handler. The secondary handler gets a + * slightly lower priority so that the primary handler can preempt it, thereby + * emulating the behavior of a non-PREEMPT_RT system where the primary handler + * runs in hard interrupt context. + */ +void sched_set_fifo_secondary(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 - 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} + void sched_set_normal(struct task_struct *p, int nice) { struct sched_attr attr = { @@ -1351,7 +1332,7 @@ static void do_sched_yield(void) rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); - current->sched_class->yield_task(rq); + rq->donor->sched_class->yield_task(rq); preempt_disable(); rq_unlock_irq(rq, &rf); @@ -1420,12 +1401,13 @@ EXPORT_SYMBOL(yield); */ int __sched yield_to(struct task_struct *p, bool preempt) { - struct task_struct *curr = current; + struct task_struct *curr; struct rq *rq, *p_rq; int yielded = 0; scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { rq = this_rq(); + curr = rq->donor; again: p_rq = task_rq(p); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 444bdfdab731..cf643a5ddedd 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1590,10 +1590,17 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA enum numa_topology_type sched_numa_topology_type; +/* + * sched_domains_numa_distance is derived from sched_numa_node_distance + * and provides a simplified view of NUMA distances used specifically + * for building NUMA scheduling domains. + */ static int sched_domains_numa_levels; +static int sched_numa_node_levels; int sched_max_numa_distance; static int *sched_domains_numa_distance; +static int *sched_numa_node_distance; static struct cpumask ***sched_domains_numa_masks; #endif /* CONFIG_NUMA */ @@ -1662,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, + + /* 50% success rate */ + .newidle_call = 512, + .newidle_success = 256, + .newidle_ratio = 512, + .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, @@ -1845,10 +1858,10 @@ bool find_numa_distance(int distance) return true; rcu_read_lock(); - distances = rcu_dereference(sched_domains_numa_distance); + distances = rcu_dereference(sched_numa_node_distance); if (!distances) goto unlock; - for (i = 0; i < sched_domains_numa_levels; i++) { + for (i = 0; i < sched_numa_node_levels; i++) { if (distances[i] == distance) { found = true; break; @@ -1924,14 +1937,34 @@ static void init_numa_topology_type(int offline_node) #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) -void sched_init_numa(int offline_node) +/* + * An architecture could modify its NUMA distance, to change + * grouping of NUMA nodes and number of NUMA levels when creating + * NUMA level sched domains. + * + * A NUMA level is created for each unique + * arch_sched_node_distance. + */ +static int numa_node_dist(int i, int j) { - struct sched_domain_topology_level *tl; - unsigned long *distance_map; + return node_distance(i, j); +} + +int arch_sched_node_distance(int from, int to) + __weak __alias(numa_node_dist); + +static bool modified_sched_node_distance(void) +{ + return numa_node_dist != arch_sched_node_distance; +} + +static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int), + int **dist, int *levels) +{ + unsigned long *distance_map __free(bitmap) = NULL; int nr_levels = 0; int i, j; int *distances; - struct cpumask ***masks; /* * O(nr_nodes^2) de-duplicating selection sort -- in order to find the @@ -1939,17 +1972,16 @@ void sched_init_numa(int offline_node) */ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); if (!distance_map) - return; + return -ENOMEM; bitmap_zero(distance_map, NR_DISTANCE_VALUES); for_each_cpu_node_but(i, offline_node) { for_each_cpu_node_but(j, offline_node) { - int distance = node_distance(i, j); + int distance = n_dist(i, j); if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { sched_numa_warn("Invalid distance value range"); - bitmap_free(distance_map); - return; + return -EINVAL; } bitmap_set(distance_map, distance, 1); @@ -1962,18 +1994,46 @@ void sched_init_numa(int offline_node) nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); - if (!distances) { - bitmap_free(distance_map); - return; - } + if (!distances) + return -ENOMEM; for (i = 0, j = 0; i < nr_levels; i++, j++) { j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); distances[i] = j; } - rcu_assign_pointer(sched_domains_numa_distance, distances); + *dist = distances; + *levels = nr_levels; - bitmap_free(distance_map); + return 0; +} + +void sched_init_numa(int offline_node) +{ + struct sched_domain_topology_level *tl; + int nr_levels, nr_node_levels; + int i, j; + int *distances, *domain_distances; + struct cpumask ***masks; + + /* Record the NUMA distances from SLIT table */ + if (sched_record_numa_dist(offline_node, numa_node_dist, &distances, + &nr_node_levels)) + return; + + /* Record modified NUMA distances for building sched domains */ + if (modified_sched_node_distance()) { + if (sched_record_numa_dist(offline_node, arch_sched_node_distance, + &domain_distances, &nr_levels)) { + kfree(distances); + return; + } + } else { + domain_distances = distances; + nr_levels = nr_node_levels; + } + rcu_assign_pointer(sched_numa_node_distance, distances); + WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]); + WRITE_ONCE(sched_numa_node_levels, nr_node_levels); /* * 'nr_levels' contains the number of unique distances @@ -1991,6 +2051,8 @@ void sched_init_numa(int offline_node) * * We reset it to 'nr_levels' at the end of this function. */ + rcu_assign_pointer(sched_domains_numa_distance, domain_distances); + sched_domains_numa_levels = 0; masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); @@ -2016,10 +2078,13 @@ void sched_init_numa(int offline_node) masks[i][j] = mask; for_each_cpu_node_but(k, offline_node) { - if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + if (sched_debug() && + (arch_sched_node_distance(j, k) != + arch_sched_node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); - if (node_distance(j, k) > sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, k) > + sched_domains_numa_distance[i]) continue; cpumask_or(mask, mask, cpumask_of_node(k)); @@ -2059,7 +2124,6 @@ void sched_init_numa(int offline_node) sched_domain_topology = tl; sched_domains_numa_levels = nr_levels; - WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); init_numa_topology_type(offline_node); } @@ -2067,14 +2131,18 @@ void sched_init_numa(int offline_node) static void sched_reset_numa(void) { - int nr_levels, *distances; + int nr_levels, *distances, *dom_distances = NULL; struct cpumask ***masks; nr_levels = sched_domains_numa_levels; + sched_numa_node_levels = 0; sched_domains_numa_levels = 0; sched_max_numa_distance = 0; sched_numa_topology_type = NUMA_DIRECT; - distances = sched_domains_numa_distance; + distances = sched_numa_node_distance; + if (sched_numa_node_distance != sched_domains_numa_distance) + dom_distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_numa_node_distance, NULL); rcu_assign_pointer(sched_domains_numa_distance, NULL); masks = sched_domains_numa_masks; rcu_assign_pointer(sched_domains_numa_masks, NULL); @@ -2083,6 +2151,7 @@ static void sched_reset_numa(void) synchronize_rcu(); kfree(distances); + kfree(dom_distances); for (i = 0; i < nr_levels && masks; i++) { if (!masks[i]) continue; @@ -2129,7 +2198,8 @@ void sched_domains_numa_masks_set(unsigned int cpu) continue; /* Set ourselves in the remote node's masks */ - if (node_distance(j, node) <= sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, node) <= + sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } } diff --git a/kernel/signal.c b/kernel/signal.c index fe9190d84f28..e42b8bd6922f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3125,7 +3125,6 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { - sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; @@ -3136,7 +3135,6 @@ void exit_signals(struct task_struct *tsk) * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ - sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); diff --git a/kernel/task_work.c b/kernel/task_work.c index d1efec571a4a..0f7519f8e7c9 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ #ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { - test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + /* + * no-op IPI + * + * TWA_NMI_CURRENT will already have set the TIF flag, all + * this interrupt does it tickle the return-to-user path. + */ } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); @@ -86,6 +91,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, break; #ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; #endif diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88aa062b8a55..f8ea8c8fc895 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2145,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); - hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + hrtimer_set_expires(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; @@ -2172,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, restart = ¤t->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + restart->nanosleep.expires = hrtimer_get_expires(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 5b6997f4dc3d..e76be24b132c 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -478,11 +478,8 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.ns_type = ns_common_type(&init_time_ns), - .ns.__ns_ref = REFCOUNT_INIT(3), + .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, - .ns.inum = ns_init_inum(&init_time_ns), - .ns.ops = &timens_operations, .frozen_offsets = true, }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2e5b89d7d866..0de2bb7cbec0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1557,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = ¤t->restart_block; - restart->nanosleep.expires = expires; + restart->nanosleep.expires = ns_to_ktime(expires); if (restart->nanosleep.type != TT_NONE) error = nanosleep_copyout(restart, &it.it_value); } @@ -1599,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) clockid_t which_clock = restart_block->nanosleep.clockid; struct timespec64 t; - t = ns_to_timespec64(restart_block->nanosleep.expires); + t = ktime_to_timespec64(restart_block->nanosleep.expires); return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index aa3120104a51..80a8a09a21a0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -475,12 +475,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (!kc->timer_create) return -EOPNOTSUPP; - new_timer = alloc_posix_timer(); - if (unlikely(!new_timer)) - return -EAGAIN; - - spin_lock_init(&new_timer->it_lock); - /* Special case for CRIU to restore timers with a given timer ID. */ if (unlikely(current->signal->timer_create_restore_ids)) { if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) @@ -490,6 +484,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, return -EINVAL; } + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + /* * Add the timer to the hash table. The timer is not yet valid * after insertion, but has a unique ID allocated. @@ -1242,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, * sys_clock_settime(). The kernel internal timekeeping is always using * nanoseconds precision independent of the clocksource device which is * used to read the time from. The resolution of that device only - * affects the presicion of the time returned by sys_clock_gettime(). + * affects the precision of the time returned by sys_clock_gettime(). * * Returns: * 0 Success. @tp contains the resolution diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5e2c2c26b3cc..ffee943d796d 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -19,6 +19,10 @@ /** * tick_program_event - program the CPU local timer device for the next event + * @expires: the time at which the next timer event should occur + * @force: flag to force reprograming even if the event time hasn't changed + * + * Return: 0 on success, negative error code on failure */ int tick_program_event(ktime_t expires, int force) { @@ -57,6 +61,13 @@ void tick_resume_oneshot(void) /** * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + * @newdev: Pointer to the clock event device to configure + * @handler: Function to be called when the event device triggers an interrupt + * @next_event: Initial expiry time for the next event (in ktime) + * + * Configures the specified clock event device for onshot mode, + * assigns the given handler as its event callback, and programs + * the device to trigger at the specified next event time. */ void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), @@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev, /** * tick_switch_to_oneshot - switch to oneshot mode + * @handler: function to call when an event occurs on the tick device + * + * Return: 0 on success, -EINVAL if the tick device is not present, + * not functional, or does not support oneshot mode. */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { @@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) /** * tick_oneshot_mode_active - check whether the system is in oneshot mode * - * returns 1 when either nohz or highres are enabled. otherwise 0. + * Return: 1 when either nohz or highres are enabled, otherwise 0. */ int tick_oneshot_mode_active(void) { @@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void) * tick_init_highres - switch to high resolution mode * * Called with interrupts disabled. + * + * Return: 0 on success, -EINVAL if the tick device cannot switch + * to oneshot/high-resolution mode. */ int tick_init_highres(void) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c527b421c865..8ddf74e705d3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -201,6 +201,27 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts, ts->flags &= ~flag; } +/* + * Allow only one non-timekeeper CPU at a time update jiffies from + * the timer tick. + * + * Returns true if update was run. + */ +static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now) +{ + static atomic_t in_progress; + int inp; + + inp = atomic_read(&in_progress); + if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1)) + return false; + + if (ts->last_tick_jiffies == jiffies) + tick_do_update_jiffies64(now); + atomic_set(&in_progress, 0); + return true; +} + #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) @@ -239,10 +260,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { - if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { - tick_do_update_jiffies64(now); - ts->stalled_jiffies = 0; - ts->last_tick_jiffies = READ_ONCE(jiffies); + if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) { + if (tick_limited_update_jiffies64(ts, now)) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } } } @@ -1152,16 +1174,15 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit >= 10) - return false; - /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; - pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", - pending); - ratelimit++; + if (ratelimit < 10) { + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + } return true; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3a4d3b2e3f74..4790da895203 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3060,29 +3060,34 @@ static const struct attribute_group aux_clock_enable_attr_group = { static int __init tk_aux_sysfs_init(void) { struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + int ret = -ENOMEM; if (!tko) - return -ENOMEM; + return ret; auxo = kobject_create_and_add("aux_clocks", tko); - if (!auxo) { - kobject_put(tko); - return -ENOMEM; - } + if (!auxo) + goto err_clean; for (int i = 0; i < MAX_AUX_CLOCKS; i++) { char id[2] = { [0] = '0' + i, }; struct kobject *clk = kobject_create_and_add(id, auxo); - if (!clk) - return -ENOMEM; - - int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + if (!clk) { + ret = -ENOMEM; + goto err_clean; + } + ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); if (ret) - return ret; + goto err_clean; } return 0; + +err_clean: + kobject_put(auxo); + kobject_put(tko); + return ret; } late_initcall(tk_aux_sysfs_init); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 553fa469d7cc..1f2364126894 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1458,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) + if (base->running_timer != timer) { ret = detach_if_pending(timer, base, true); - if (shutdown) - timer->function = NULL; + if (shutdown) + timer->function = NULL; + } raw_spin_unlock_irqrestore(&base->lock, flags); @@ -2472,7 +2473,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK - if (in_irq()) + if (in_hardirq()) irq_work_tick(); #endif sched_tick(); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index c0c54dc5314c..18dda1aa782d 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -10,6 +10,7 @@ #include <linux/spinlock.h> #include <linux/timerqueue.h> #include <trace/events/ipi.h> +#include <linux/sched/isolation.h> #include "timer_migration.h" #include "tick-internal.h" @@ -420,14 +421,54 @@ static struct list_head *tmigr_level_list __read_mostly; static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly; +static struct tmigr_group *tmigr_root; + static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); +/* + * CPUs available for timer migration. + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * Additionally tmigr_available_mutex serializes set/clear operations with each other. + */ +static cpumask_var_t tmigr_available_cpumask; +static DEFINE_MUTEX(tmigr_available_mutex); + +/* Enabled during late initcall */ +static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated); + #define TMIGR_NONE 0xFF #define BIT_CNT 8 static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) { - return !(tmc->tmgroup && tmc->online); + return !(tmc->tmgroup && tmc->available); +} + +/* + * Returns true if @cpu should be excluded from the hierarchy as isolated. + * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs + * are still part of the hierarchy but become idle (from a tick and timer + * migration perspective) when they stop their tick. This lets the timekeeping + * CPU handle their global timers. Marking also isolated CPUs as idle would be + * too costly, hence they are completely excluded from the hierarchy. + * This check is necessary, for instance, to prevent offline isolated CPUs from + * being incorrectly marked as available once getting back online. + * + * This function returns false during early boot and the isolation logic is + * enabled only after isolated CPUs are marked as unavailable at late boot. + * The tick CPU can be isolated at boot, however we cannot mark it as + * unavailable to avoid having no global migrator for the nohz_full CPUs. This + * should be ensured by the callers of this function: implicitly from hotplug + * callbacks and explicitly in tmigr_init_isolation() and + * tmigr_isolated_exclude_cpumask(). + */ +static inline bool tmigr_is_isolated(int cpu) +{ + if (!static_branch_unlikely(&tmigr_exclude_isolated)) + return false; + return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) || + cpuset_cpu_is_isolated(cpu)) && + housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE); } /* @@ -502,11 +543,6 @@ static bool tmigr_check_lonely(struct tmigr_group *group) * @now: timer base monotonic * @check: is set if there is the need to handle remote timers; * required in tmigr_requires_handle_remote() only - * @tmc_active: this flag indicates, whether the CPU which triggers - * the hierarchy walk is !idle in the timer migration - * hierarchy. When the CPU is idle and the whole hierarchy is - * idle, only the first event of the top level has to be - * considered. */ struct tmigr_walk { u64 nextexp; @@ -517,16 +553,13 @@ struct tmigr_walk { unsigned long basej; u64 now; bool check; - bool tmc_active; }; typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); -static void __walk_groups(up_f up, struct tmigr_walk *data, - struct tmigr_cpu *tmc) +static void __walk_groups_from(up_f up, struct tmigr_walk *data, + struct tmigr_group *child, struct tmigr_group *group) { - struct tmigr_group *child = NULL, *group = tmc->tmgroup; - do { WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); @@ -544,6 +577,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, } while (group); } +static void __walk_groups(up_f up, struct tmigr_walk *data, + struct tmigr_cpu *tmc) +{ + __walk_groups_from(up, data, NULL, tmc->tmgroup); +} + static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { lockdep_assert_held(&tmc->lock); @@ -708,7 +747,7 @@ void tmigr_cpu_activate(void) /* * Returns true, if there is nothing to be propagated to the next level * - * @data->firstexp is set to expiry of first gobal event of the (top level of + * @data->firstexp is set to expiry of first global event of the (top level of * the) hierarchy, but only when hierarchy is completely idle. * * The child and group states need to be read under the lock, to prevent a race @@ -926,7 +965,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * updated the event takes care when hierarchy is completely * idle. Otherwise the migrator does it as the event is enqueued. */ - if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || + if (!tmc->available || tmc->remote || tmc->cpuevt.ignore || now < tmc->cpuevt.nextevt.expires) { raw_spin_unlock_irq(&tmc->lock); return; @@ -973,7 +1012,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * (See also section "Required event and timerqueue update after a * remote expiry" in the documentation at the top) */ - if (!tmc->online || !tmc->idle) { + if (!tmc->available || !tmc->idle) { timer_unlock_remote_bases(cpu); goto unlock; } @@ -1113,15 +1152,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, */ if (!tmigr_check_migrator(group, childmask)) return true; - - /* - * When there is a parent group and the CPU which triggered the - * hierarchy walk is not active, proceed the walk to reach the top level - * group before reading the next_expiry value. - */ - if (group->parent && !data->tmc_active) - return false; - /* * The lock is required on 32bit architectures to read the variable * consistently with a concurrent writer. On 64bit the lock is not @@ -1166,7 +1196,6 @@ bool tmigr_requires_handle_remote(void) data.now = get_jiffies_update(&jif); data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; - data.tmc_active = !tmc->idle; data.check = false; /* @@ -1432,38 +1461,43 @@ static long tmigr_trigger_active(void *unused) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - WARN_ON_ONCE(!tmc->online || tmc->idle); + WARN_ON_ONCE(!tmc->available || tmc->idle); return 0; } -static int tmigr_cpu_offline(unsigned int cpu) +static int tmigr_clear_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); int migrator; u64 firstexp; - raw_spin_lock_irq(&tmc->lock); - tmc->online = false; - WRITE_ONCE(tmc->wakeup, KTIME_MAX); + guard(mutex)(&tmigr_available_mutex); - /* - * CPU has to handle the local events on his own, when on the way to - * offline; Therefore nextevt value is set to KTIME_MAX - */ - firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); - trace_tmigr_cpu_offline(tmc); - raw_spin_unlock_irq(&tmc->lock); + cpumask_clear_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (!tmc->available) + return 0; + tmc->available = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * CPU has to handle the local events on his own, when on the way to + * offline; Therefore nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_unavailable(tmc); + } if (firstexp != KTIME_MAX) { - migrator = cpumask_any_but(cpu_online_mask, cpu); + migrator = cpumask_any(tmigr_available_cpumask); work_on_cpu(migrator, tmigr_trigger_active, NULL); } return 0; } -static int tmigr_cpu_online(unsigned int cpu) +static int tmigr_set_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); @@ -1471,16 +1505,123 @@ static int tmigr_cpu_online(unsigned int cpu) if (WARN_ON_ONCE(!tmc->tmgroup)) return -EINVAL; - raw_spin_lock_irq(&tmc->lock); - trace_tmigr_cpu_online(tmc); - tmc->idle = timer_base_is_idle(); - if (!tmc->idle) - __tmigr_cpu_activate(tmc); - tmc->online = true; - raw_spin_unlock_irq(&tmc->lock); + if (tmigr_is_isolated(cpu)) + return 0; + + guard(mutex)(&tmigr_available_mutex); + + cpumask_set_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (tmc->available) + return 0; + trace_tmigr_cpu_available(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->available = true; + } return 0; } +static void tmigr_cpu_isolate(struct work_struct *ignored) +{ + tmigr_clear_cpu_available(smp_processor_id()); +} + +static void tmigr_cpu_unisolate(struct work_struct *ignored) +{ + tmigr_set_cpu_available(smp_processor_id()); +} + +/** + * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy + * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy + * + * This function can be called from cpuset code to provide the new set of + * isolated CPUs that should be excluded from the hierarchy. + * Online CPUs not present in exclude_cpumask but already excluded are brought + * back to the hierarchy. + * Functions to isolate/unisolate need to be called locally and can sleep. + */ +int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) +{ + struct work_struct __percpu *works __free(free_percpu) = + alloc_percpu(struct work_struct); + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + int cpu; + + lockdep_assert_cpus_held(); + + if (!works) + return -ENOMEM; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + /* + * First set previously isolated CPUs as available (unisolate). + * This cpumask contains only CPUs that switched to available now. + */ + cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask); + cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask); + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_unisolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + /* + * Then clear previously available CPUs (isolate). + * This cpumask contains only CPUs that switched to not available now. + * There cannot be overlap with the newly available ones. + */ + cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask); + cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)); + /* + * Handle this here and not in the cpuset code because exclude_cpumask + * might include also the tick CPU if included in isolcpus. + */ + for_each_cpu(cpu, cpumask) { + if (!tick_nohz_cpu_hotpluggable(cpu)) { + cpumask_clear_cpu(cpu, cpumask); + break; + } + } + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_isolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + return 0; +} + +static int __init tmigr_init_isolation(void) +{ + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + + static_branch_enable(&tmigr_exclude_isolated); + + if (!housekeeping_enabled(HK_TYPE_DOMAIN)) + return 0; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + + /* Protect against RCU torture hotplug testing */ + guard(cpus_read_lock)(); + return tmigr_isolated_exclude_cpumask(cpumask); +} +late_initcall(tmigr_init_isolation); + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { @@ -1498,21 +1639,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, s.seq = 0; atomic_set(&group->migr_state, s.state); - /* - * If this is a new top-level, prepare its groupmask in advance. - * This avoids accidents where yet another new top-level is - * created in the future and made visible before the current groupmask. - */ - if (list_empty(&tmigr_level_list[lvl])) { - group->groupmask = BIT(0); - /* - * The previous top level has prepared its groupmask already, - * simply account it as the first child. - */ - if (lvl > 0) - group->num_children = 1; - } - timerqueue_init_head(&group->events); timerqueue_init(&group->groupevt.nextevt); group->groupevt.nextevt.expires = KTIME_MAX; @@ -1520,8 +1646,7 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, group->groupevt.ignore = true; } -static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, - unsigned int lvl) +static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) { struct tmigr_group *tmp, *group = NULL; @@ -1567,25 +1692,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, return group; } +static bool tmigr_init_root(struct tmigr_group *group, bool activate) +{ + if (!group->parent && group != tmigr_root) { + /* + * This is the new top-level, prepare its groupmask in advance + * to avoid accidents where yet another new top-level is + * created in the future and made visible before this groupmask. + */ + group->groupmask = BIT(0); + WARN_ON_ONCE(activate); + + return true; + } + + return false; + +} + static void tmigr_connect_child_parent(struct tmigr_group *child, struct tmigr_group *parent, bool activate) { - struct tmigr_walk data; - - raw_spin_lock_irq(&child->lock); - raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); + if (tmigr_init_root(parent, activate)) { + /* + * The previous top level had prepared its groupmask already, + * simply account it in advance as the first child. If some groups + * have been created between the old and new root due to node + * mismatch, the new root's child will be intialized accordingly. + */ + parent->num_children = 1; + } - if (activate) { + /* Connecting old root to new root ? */ + if (!parent->parent && activate) { /* - * @child is the old top and @parent the new one. In this - * case groupmask is pre-initialized and @child already - * accounted, along with its new sibling corresponding to the - * CPU going up. + * @child is the old top, or in case of node mismatch, some + * intermediate group between the old top and the new one in + * @parent. In this case the @child must be pre-accounted above + * as the first child. Its new inactive sibling corresponding + * to the CPU going up has been accounted as the second child. */ - WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); + WARN_ON_ONCE(parent->num_children != 2); + child->groupmask = BIT(0); } else { - /* Adding @child for the CPU going up to @parent. */ + /* Common case adding @child for the CPU going up to @parent. */ child->groupmask = BIT(parent->num_children++); } @@ -1596,87 +1747,61 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, */ smp_store_release(&child->parent, parent); - raw_spin_unlock(&parent->lock); - raw_spin_unlock_irq(&child->lock); - trace_tmigr_connect_child_parent(child); - - if (!activate) - return; - - /* - * To prevent inconsistent states, active children need to be active in - * the new parent as well. Inactive children are already marked inactive - * in the parent group: - * - * * When new groups were created by tmigr_setup_groups() starting from - * the lowest level (and not higher then one level below the current - * top level), then they are not active. They will be set active when - * the new online CPU comes active. - * - * * But if a new group above the current top level is required, it is - * mandatory to propagate the active state of the already existing - * child to the new parent. So tmigr_connect_child_parent() is - * executed with the formerly top level group (child) and the newly - * created group (parent). - * - * * It is ensured that the child is active, as this setup path is - * executed in hotplug prepare callback. This is exectued by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. - */ - data.childmask = child->groupmask; - - /* - * There is only one new level per time (which is protected by - * tmigr_mutex). When connecting the child and the parent and set the - * child active when the parent is inactive, the parent needs to be the - * uppermost level. Otherwise there went something wrong! - */ - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } -static int tmigr_setup_groups(unsigned int cpu, unsigned int node) +static int tmigr_setup_groups(unsigned int cpu, unsigned int node, + struct tmigr_group *start, bool activate) { struct tmigr_group *group, *child, **stack; - int top = 0, err = 0, i = 0; - struct list_head *lvllist; + int i, top = 0, err = 0, start_lvl = 0; + bool root_mismatch = false; stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); if (!stack) return -ENOMEM; - do { - group = tmigr_get_group(cpu, node, i); + if (start) { + stack[start->level] = start; + start_lvl = start->level + 1; + } + + if (tmigr_root) + root_mismatch = tmigr_root->numa_node != node; + + for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { + group = tmigr_get_group(node, i); if (IS_ERR(group)) { err = PTR_ERR(group); + i--; break; } top = i; - stack[i++] = group; + stack[i] = group; /* * When booting only less CPUs of a system than CPUs are - * available, not all calculated hierarchy levels are required. + * available, not all calculated hierarchy levels are required, + * unless a node mismatch is detected. * * The loop is aborted as soon as the highest level, which might * be different from tmigr_hierarchy_levels, contains only a - * single group. + * single group, unless the nodes mismatch below tmigr_crossnode_level */ - if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) + if (group->parent) break; + if ((!root_mismatch || i >= tmigr_crossnode_level) && + list_is_singular(&tmigr_level_list[i])) + break; + } - } while (i < tmigr_hierarchy_levels); - - /* Assert single root */ - WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); + /* Assert single root without parent */ + if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels)) + return -EINVAL; - while (i > 0) { - group = stack[--i]; + for (; i >= start_lvl; i--) { + group = stack[i]; if (err < 0) { list_del(&group->list); @@ -1692,12 +1817,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) if (i == 0) { struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); - raw_spin_lock_irq(&group->lock); - tmc->tmgroup = group; tmc->groupmask = BIT(group->num_children++); - raw_spin_unlock_irq(&group->lock); + tmigr_init_root(group, activate); trace_tmigr_connect_cpu_parent(tmc); @@ -1705,42 +1828,58 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) continue; } else { child = stack[i - 1]; - /* Will be activated at online time */ - tmigr_connect_child_parent(child, group, false); + tmigr_connect_child_parent(child, group, activate); } + } - /* check if uppermost level was newly created */ - if (top != i) - continue; - - WARN_ON_ONCE(top == 0); + if (err < 0) + goto out; - lvllist = &tmigr_level_list[top]; + if (activate) { + struct tmigr_walk data; + union tmigr_state state; /* - * Newly created root level should have accounted the upcoming - * CPU's child group and pre-accounted the old root. + * To prevent inconsistent states, active children need to be active in + * the new parent as well. Inactive children are already marked inactive + * in the parent group: + * + * * When new groups were created by tmigr_setup_groups() starting from + * the lowest level, then they are not active. They will be set active + * when the new online CPU comes active. + * + * * But if new groups above the current top level are required, it is + * mandatory to propagate the active state of the already existing + * child to the new parents. So tmigr_active_up() activates the + * new parents while walking up from the old root to the new. + * + * * It is ensured that @start is active, as this setup path is + * executed in hotplug prepare callback. This is executed by an + * already connected and !idle CPU. Even if all other CPUs go idle, + * the CPU executing the setup will be responsible up to current top + * level group. And the next time it goes inactive, it will release + * the new childmask and parent to subsequent walkers through this + * @child. Therefore propagate active state unconditionally. */ - if (group->num_children == 2 && list_is_singular(lvllist)) { - /* - * The target CPU must never do the prepare work, except - * on early boot when the boot CPU is the target. Otherwise - * it may spuriously activate the old top level group inside - * the new one (nevertheless whether old top level group is - * active or not) and/or release an uninitialized childmask. - */ - WARN_ON_ONCE(cpu == raw_smp_processor_id()); - - lvllist = &tmigr_level_list[top - 1]; - list_for_each_entry(child, lvllist, list) { - if (child->parent) - continue; + state.state = atomic_read(&start->migr_state); + WARN_ON_ONCE(!state.active); + WARN_ON_ONCE(!start->parent); + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } - tmigr_connect_child_parent(child, group, true); - } + /* Root update */ + if (list_is_singular(&tmigr_level_list[top])) { + group = list_first_entry(&tmigr_level_list[top], + typeof(*group), list); + WARN_ON_ONCE(group->parent); + if (tmigr_root) { + /* Old root should be the same or below */ + WARN_ON_ONCE(tmigr_root->level > top); } + tmigr_root = group; } - +out: kfree(stack); return err; @@ -1748,12 +1887,31 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) static int tmigr_add_cpu(unsigned int cpu) { + struct tmigr_group *old_root = tmigr_root; int node = cpu_to_node(cpu); int ret; - mutex_lock(&tmigr_mutex); - ret = tmigr_setup_groups(cpu, node); - mutex_unlock(&tmigr_mutex); + guard(mutex)(&tmigr_mutex); + + ret = tmigr_setup_groups(cpu, node, NULL, false); + + /* Root has changed? Connect the old one to the new */ + if (ret >= 0 && old_root && old_root != tmigr_root) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + /* + * The (likely) current CPU is expected to be online in the hierarchy, + * otherwise the old root may not be active as expected. + */ + WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available); + ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); + } return ret; } @@ -1798,6 +1956,11 @@ static int __init tmigr_init(void) if (ncpus == 1) return 0; + if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err; + } + /* * Calculate the required hierarchy levels. Unfortunately there is no * reliable information available, unless all possible CPUs have been @@ -1847,7 +2010,7 @@ static int __init tmigr_init(void) goto err; ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", - tmigr_cpu_online, tmigr_cpu_offline); + tmigr_set_cpu_available, tmigr_clear_cpu_available); if (ret) goto err; diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index ae19f70f8170..70879cde6fdd 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -97,7 +97,7 @@ struct tmigr_group { */ struct tmigr_cpu { raw_spinlock_t lock; - bool online; + bool available; bool idle; bool remote; struct tmigr_group *tmgroup; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 42bd2ba68a82..59cfacb8a5bb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1971,7 +1971,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops) */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, - struct ftrace_hash *new_hash) + struct ftrace_hash *new_hash, + bool update_target) { struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; @@ -2006,10 +2007,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (rec->flags & FTRACE_FL_DISABLED) continue; - /* We need to update only differences of filter_hash */ + /* + * Unless we are updating the target of a direct function, + * we only need to update differences of filter_hash + */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); - if (in_old == in_new) + if (!update_target && (in_old == in_new)) continue; if (in_new) { @@ -2020,7 +2024,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (is_ipmodify) goto rollback; - FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + /* + * If this is called by __modify_ftrace_direct() + * then it is only changing where the direct + * pointer is jumping to, and the record already + * points to a direct trampoline. If it isn't, + * then it is a bug to update ipmodify on a direct + * caller. + */ + FTRACE_WARN_ON(!update_target && + (rec->flags & FTRACE_FL_DIRECT)); /* * Another ops with IPMODIFY is already @@ -2076,7 +2089,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false); } /* Disabling always succeeds */ @@ -2087,7 +2100,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false); } static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, @@ -2101,7 +2114,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, if (ftrace_hash_empty(new_hash)) new_hash = NULL; - return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false); } static void print_ip_ins(const char *fmt, const unsigned char *p) @@ -5953,6 +5966,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp) free_ftrace_hash(fhp); } +static void reset_direct(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + remove_direct_functions_hash(hash, addr); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -6048,6 +6072,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) ops->direct_call = addr; err = register_ftrace_function_nolock(ops); + if (err) + reset_direct(ops, addr); out_unlock: mutex_unlock(&direct_mutex); @@ -6080,7 +6106,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, bool free_filters) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; int err; if (check_direct_multi(ops)) @@ -6090,13 +6115,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, mutex_lock(&direct_mutex); err = unregister_ftrace_function(ops); - remove_direct_functions_hash(hash, addr); + reset_direct(ops, addr); mutex_unlock(&direct_mutex); - /* cleanup for possible another register call */ - ops->func = NULL; - ops->trampoline = 0; - if (free_filters) ftrace_free_filter(ops); return err; @@ -6106,7 +6127,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct); static int __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash; + struct ftrace_hash *hash = ops->func_hash->filter_hash; struct ftrace_func_entry *entry, *iter; static struct ftrace_ops tmp_ops = { .func = ftrace_stub, @@ -6127,12 +6148,20 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, hash, hash, true); + if (err) + goto out; + + /* * Now the ftrace_ops_list_func() is called to do the direct callers. * We can safely change the direct functions attached to each entry. */ mutex_lock(&ftrace_lock); - hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -6147,6 +6176,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) mutex_unlock(&ftrace_lock); +out: /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1244d2c5c384..afcd3747264d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7344,6 +7344,10 @@ consume: goto out; } + /* Did the reader catch up with the writer? */ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) + goto out; + reader = rb_get_reader_page(cpu_buffer); if (WARN_ON(!reader)) goto out; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1e527cf2aae..304e93597126 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8781,8 +8781,18 @@ static void tracing_buffers_mmap_close(struct vm_area_struct *vma) put_snapshot_map(iter->tr); } +static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Trace buffer mappings require the complete buffer including + * the meta page. Partial mappings are not supported. + */ + return -EINVAL; +} + static const struct vm_operations_struct tracing_buffers_vmops = { .close = tracing_buffers_mmap_close, + .may_split = tracing_buffers_may_split, }; static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1d536219b624..6bfaf1210dd2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3272,14 +3272,16 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); - kfree(val); + destroy_hist_field(val, 0); ret = PTR_ERR(var); goto err; } field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); if (!field_var) { - kfree(val); + destroy_hist_field(val, 0); + kfree_const(var->type); + kfree(var->var.name); kfree(var); ret = -ENOMEM; goto err; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index c428dafe7496..b15854c75d4f 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1449,12 +1449,7 @@ static struct trace_event_functions user_event_funcs = { static int user_event_set_call_visible(struct user_event *user, bool visible) { - int ret; - const struct cred *old_cred; - struct cred *cred; - - cred = prepare_creds(); - + CLASS(prepare_creds, cred)(); if (!cred) return -ENOMEM; @@ -1469,17 +1464,12 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) */ cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds(cred); - - if (visible) - ret = trace_add_event_call(&user->call); - else - ret = trace_remove_event_call(&user->call); - - revert_creds(old_cred); - put_cred(cred); + scoped_with_creds(cred) { + if (visible) + return trace_add_event_call(&user->call); - return ret; + return trace_remove_event_call(&user->call); + } } static int destroy_user_event(struct user_event *user) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index ad9d6347b5fa..8001dbf16891 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -106,13 +106,14 @@ static struct tracepoint_user *__tracepoint_user_init(const char *name, struct t if (!tuser->name) return NULL; + /* Register tracepoint if it is loaded. */ if (tpoint) { + tuser->tpoint = tpoint; ret = tracepoint_user_register(tuser); if (ret) return ERR_PTR(ret); } - tuser->tpoint = tpoint; tuser->refcount = 1; INIT_LIST_HEAD(&tuser->list); list_add(&tuser->list, &tracepoint_user_list); @@ -1513,6 +1514,10 @@ static int disable_trace_fprobe(struct trace_event_call *call, if (!trace_probe_is_enabled(tp)) { list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { unregister_fprobe(&tf->fp); + if (tf->tuser) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; + } } } diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index dc6040aae3ee..a88fb481c4a3 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -53,7 +53,7 @@ DEFINE_STATIC_SRCU(unwind_srcu); static inline bool unwind_pending(struct unwind_task_info *info) { - return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask); + return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING; } /* @@ -79,6 +79,8 @@ static u64 get_cookie(struct unwind_task_info *info) { u32 cnt = 1; + lockdep_assert_irqs_disabled(); + if (info->id.cpu) return info->id.id; @@ -126,23 +128,20 @@ int unwind_user_faultable(struct unwind_stacktrace *trace) cache = info->cache; trace->entries = cache->entries; - - if (cache->nr_entries) { - /* - * The user stack has already been previously unwound in this - * entry context. Skip the unwind and use the cache. - */ - trace->nr = cache->nr_entries; + trace->nr = cache->nr_entries; + /* + * The user stack has already been previously unwound in this + * entry context. Skip the unwind and use the cache. + */ + if (trace->nr) return 0; - } - trace->nr = 0; unwind_user(trace, UNWIND_MAX_ENTRIES); cache->nr_entries = trace->nr; /* Clear nr_entries on way back to user space */ - set_bit(UNWIND_USED_BIT, &info->unwind_mask); + atomic_long_or(UNWIND_USED, &info->unwind_mask); return 0; } @@ -160,7 +159,7 @@ static void process_unwind_deferred(struct task_struct *task) /* Clear pending bit but make sure to have the current bits */ bits = atomic_long_fetch_andnot(UNWIND_PENDING, - (atomic_long_t *)&info->unwind_mask); + &info->unwind_mask); /* * From here on out, the callback must always be called, even if it's * just an empty trace. @@ -231,6 +230,7 @@ void unwind_deferred_task_exit(struct task_struct *task) int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { struct unwind_task_info *info = ¤t->unwind_info; + int twa_mode = TWA_RESUME; unsigned long old, bits; unsigned long bit; int ret; @@ -246,8 +246,11 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) * Trigger a warning to make it obvious that an architecture * is using this in NMI when it should not be. */ - if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi())) - return -EINVAL; + if (in_nmi()) { + if (WARN_ON_ONCE(!CAN_USE_IN_NMI)) + return -EINVAL; + twa_mode = TWA_NMI_CURRENT; + } /* Do not allow cancelled works to request again */ bit = READ_ONCE(work->bit); @@ -261,7 +264,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) *cookie = get_cookie(info); - old = READ_ONCE(info->unwind_mask); + old = atomic_long_read(&info->unwind_mask); /* Is this already queued or executed */ if (old & bit) @@ -274,7 +277,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) * to have a callback. */ bits = UNWIND_PENDING | bit; - old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask); + old = atomic_long_fetch_or(bits, &info->unwind_mask); if (old & bits) { /* * If the work's bit was set, whatever set it had better @@ -285,10 +288,10 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) } /* The work has been claimed, now schedule it. */ - ret = task_work_add(current, &info->work, TWA_RESUME); + ret = task_work_add(current, &info->work, twa_mode); if (WARN_ON_ONCE(ret)) - WRITE_ONCE(info->unwind_mask, 0); + atomic_long_set(&info->unwind_mask, 0); return ret; } @@ -320,7 +323,8 @@ void unwind_deferred_cancel(struct unwind_work *work) guard(rcu)(); /* Clear this bit from all threads */ for_each_process_thread(g, t) { - clear_bit(bit, &t->unwind_info.unwind_mask); + atomic_long_andnot(BIT(bit), + &t->unwind_info.unwind_mask); if (t->unwind_info.cache) clear_bit(bit, &t->unwind_info.cache->unwind_completed); } @@ -350,7 +354,7 @@ void unwind_task_init(struct task_struct *task) memset(info, 0, sizeof(*info)); init_task_work(&info->work, unwind_deferred_task_work); - info->unwind_mask = 0; + atomic_long_set(&info->unwind_mask, 0); } void unwind_task_free(struct task_struct *task) diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 97a8415e3216..39e270789444 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -8,18 +8,28 @@ #include <linux/unwind_user.h> #include <linux/uaccess.h> -static const struct unwind_user_frame fp_frame = { - ARCH_INIT_USER_FP_FRAME -}; - #define for_each_user_frame(state) \ for (unwind_user_start(state); !(state)->done; unwind_user_next(state)) -static int unwind_user_next_fp(struct unwind_user_state *state) +static inline int +get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws) +{ + unsigned long __user *addr = (void __user *)base + off; +#ifdef CONFIG_COMPAT + if (ws == sizeof(int)) { + unsigned int data; + int ret = get_user(data, (unsigned int __user *)addr); + *word = data; + return ret; + } +#endif + return get_user(*word, addr); +} + +static int unwind_user_next_common(struct unwind_user_state *state, + const struct unwind_user_frame *frame) { - const struct unwind_user_frame *frame = &fp_frame; unsigned long cfa, fp, ra; - unsigned int shift; if (frame->use_fp) { if (state->fp < state->sp) @@ -37,24 +47,45 @@ static int unwind_user_next_fp(struct unwind_user_state *state) return -EINVAL; /* Make sure that the address is word aligned */ - shift = sizeof(long) == 4 ? 2 : 3; - if (cfa & ((1 << shift) - 1)) + if (cfa & (state->ws - 1)) return -EINVAL; /* Find the Return Address (RA) */ - if (get_user(ra, (unsigned long *)(cfa + frame->ra_off))) + if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) return -EINVAL; - if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off))) + if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) return -EINVAL; state->ip = ra; state->sp = cfa; if (frame->fp_off) state->fp = fp; + state->topmost = false; return 0; } +static int unwind_user_next_fp(struct unwind_user_state *state) +{ +#ifdef CONFIG_HAVE_UNWIND_USER_FP + struct pt_regs *regs = task_pt_regs(current); + + if (state->topmost && unwind_user_at_function_start(regs)) { + const struct unwind_user_frame fp_entry_frame = { + ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_entry_frame); + } + + const struct unwind_user_frame fp_frame = { + ARCH_INIT_USER_FP_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_frame); +#else + return -EINVAL; +#endif +} + static int unwind_user_next(struct unwind_user_state *state) { unsigned long iter_mask = state->available_types; @@ -102,6 +133,12 @@ static int unwind_user_start(struct unwind_user_state *state) state->ip = instruction_pointer(regs); state->sp = user_stack_pointer(regs); state->fp = frame_pointer(regs); + state->ws = unwind_user_word_size(regs); + if (!state->ws) { + state->done = true; + return -EINVAL; + } + state->topmost = true; return 0; } diff --git a/kernel/user.c b/kernel/user.c index 0163665914c9..7aef4e679a6a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -35,6 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc); * and 1 for... ? */ struct user_namespace init_user_ns = { + .ns = NS_COMMON_INIT(init_user_ns), .uid_map = { { .extent[0] = { @@ -65,14 +66,8 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.ns_type = ns_common_type(&init_user_ns), - .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .ns.inum = ns_init_inum(&init_user_ns), -#ifdef CONFIG_USER_NS - .ns.ops = &userns_operations, -#endif .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 7e45559521af..52f89f1137da 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -119,9 +119,9 @@ static bool post_one_notification(struct watch_queue *wqueue, offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; - p = kmap_atomic(page); + p = kmap_local_page(page); memcpy(p + offset, n, len); - kunmap_atomic(p); + kunmap_local(p); buf = pipe_buf(pipe, head); buf->page = page; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3034e294d50d..742b23ef0d8b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -206,6 +206,16 @@ config DEBUG_BUGVERBOSE of the BUG call as well as the EIP and oops trace. This aids debugging but costs about 70-100K of memory. +config DEBUG_BUGVERBOSE_DETAILED + bool "Verbose WARN_ON_ONCE() reporting (adds 100K)" if DEBUG_BUGVERBOSE + help + Say Y here to make WARN_ON_ONCE() output the condition string of the + warning, in addition to the file name and line number. + This helps debugging, but costs about 100K of memory. + + Say N if unsure. + + endmenu # "printk and dmesg options" config DEBUG_KERNEL @@ -445,8 +455,7 @@ config FRAME_WARN default 2048 if GCC_PLUGIN_LATENT_ENTROPY default 2048 if PARISC default 1536 if (!64BIT && XTENSA) - default 1280 if KASAN && !64BIT - default 1024 if !64BIT + default 1280 if !64BIT default 2048 if 64BIT help Tell the compiler to warn at build time for stack frames larger than this. diff --git a/lib/bitmap.c b/lib/bitmap.c index b97692854966..9dc526507875 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -355,6 +355,12 @@ unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_weight_andnot); +unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int bits) +{ + return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits); +} + void __bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); diff --git a/lib/bug.c b/lib/bug.c index b1f07459c2ee..edd9041f89f3 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -139,6 +139,29 @@ void bug_get_file_line(struct bug_entry *bug, const char **file, #endif } +static const char *bug_get_format(struct bug_entry *bug) +{ + const char *format = NULL; +#ifdef HAVE_ARCH_BUG_FORMAT +#ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS + /* + * Allow an architecture to: + * - relative encode NULL (difficult vs KASLR); + * - use a literal 0 (there are no valid objects inside + * the __bug_table itself to refer to after all); + * - use an empty string. + */ + if (bug->format_disp) + format = (const char *)&bug->format_disp + bug->format_disp; + if (format && format[0] == '\0') + format = NULL; +#else + format = bug->format; +#endif +#endif + return format; +} + struct bug_entry *find_bug(unsigned long bugaddr) { struct bug_entry *bug; @@ -150,26 +173,51 @@ struct bug_entry *find_bug(unsigned long bugaddr) return module_find_bug(bugaddr); } -static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs) +static void __warn_printf(const char *fmt, struct pt_regs *regs) { - struct bug_entry *bug; - const char *file; - unsigned line, warning, once, done; + if (!fmt) + return; + +#ifdef HAVE_ARCH_BUG_FORMAT_ARGS + if (regs) { + struct arch_va_list _args; + va_list *args = __warn_args(&_args, regs); + + if (args) { + vprintk(fmt, *args); + return; + } + } +#endif + + printk("%s", fmt); +} - if (!is_valid_bugaddr(bugaddr)) - return BUG_TRAP_TYPE_NONE; +static enum bug_trap_type __report_bug(struct bug_entry *bug, unsigned long bugaddr, struct pt_regs *regs) +{ + bool warning, once, done, no_cut, has_args; + const char *file, *fmt; + unsigned line; + + if (!bug) { + if (!is_valid_bugaddr(bugaddr)) + return BUG_TRAP_TYPE_NONE; - bug = find_bug(bugaddr); - if (!bug) - return BUG_TRAP_TYPE_NONE; + bug = find_bug(bugaddr); + if (!bug) + return BUG_TRAP_TYPE_NONE; + } disable_trace_on_warning(); bug_get_file_line(bug, &file, &line); + fmt = bug_get_format(bug); - warning = (bug->flags & BUGFLAG_WARNING) != 0; - once = (bug->flags & BUGFLAG_ONCE) != 0; - done = (bug->flags & BUGFLAG_DONE) != 0; + warning = bug->flags & BUGFLAG_WARNING; + once = bug->flags & BUGFLAG_ONCE; + done = bug->flags & BUGFLAG_DONE; + no_cut = bug->flags & BUGFLAG_NO_CUT_HERE; + has_args = bug->flags & BUGFLAG_ARGS; if (warning && once) { if (done) @@ -187,8 +235,10 @@ static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *re * "cut here" line now. WARN() issues its own "cut here" before the * extra debugging message it writes before triggering the handler. */ - if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0) + if (!no_cut) { printk(KERN_DEFAULT CUT_HERE); + __warn_printf(fmt, has_args ? regs : NULL); + } if (warning) { /* this is a WARN_ON rather than BUG/BUG_ON */ @@ -206,13 +256,25 @@ static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *re return BUG_TRAP_TYPE_BUG; } +enum bug_trap_type report_bug_entry(struct bug_entry *bug, struct pt_regs *regs) +{ + enum bug_trap_type ret; + bool rcu = false; + + rcu = warn_rcu_enter(); + ret = __report_bug(bug, 0, regs); + warn_rcu_exit(rcu); + + return ret; +} + enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) { enum bug_trap_type ret; bool rcu = false; rcu = warn_rcu_enter(); - ret = __report_bug(bugaddr, regs); + ret = __report_bug(NULL, bugaddr, regs); warn_rcu_exit(rcu); return ret; diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 8886055e938f..16859c6226dd 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -64,7 +64,7 @@ config CRYPTO_LIB_CURVE25519 config CRYPTO_LIB_CURVE25519_ARCH bool depends on CRYPTO_LIB_CURVE25519 && !UML && !KMSAN - default y if ARM && KERNEL_MODE_NEON + default y if ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN default y if PPC64 && CPU_LITTLE_ENDIAN default y if X86_64 diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index bded351aeace..d2845b214585 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -90,7 +90,7 @@ else libcurve25519-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += curve25519-fiat32.o endif # clang versions prior to 18 may blow out the stack with KASAN -ifeq ($(call clang-min-version, 180000),) +ifeq ($(CONFIG_CC_IS_CLANG)_$(call clang-min-version, 180000),y_) KASAN_SANITIZE_curve25519-hacl64.o := n endif diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c index dcedfca06df6..5dccdee79693 100644 --- a/lib/crypto/tests/sha256_kunit.c +++ b/lib/crypto/tests/sha256_kunit.c @@ -68,6 +68,7 @@ static void test_sha256_finup_2x(struct kunit *test) rand_bytes(data1_buf, max_data_len); rand_bytes(data2_buf, max_data_len); rand_bytes(salt, sizeof(salt)); + memset(ctx, 0, sizeof(*ctx)); for (size_t i = 0; i < 500; i++) { size_t salt_len = rand_length(sizeof(salt)); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 7f50c4480a4e..ecf8e7f978e3 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -714,13 +714,13 @@ static void debug_objects_fill_pool(void) * raw_spinlock_t are basically the same type and this lock-type * inversion works just fine. */ - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible() || system_state < SYSTEM_SCHEDULING) { /* * Annotate away the spinlock_t inside raw_spinlock_t warning - * by temporarily raising the wait-type to WAIT_SLEEP, matching + * by temporarily raising the wait-type to LD_WAIT_CONFIG, matching * the preemptible() condition above. */ - static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP); + static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_CONFIG); lock_map_acquire_try(&fill_pool_map); fill_pool(); lock_map_release(&fill_pool_map); diff --git a/lib/interval_tree.c b/lib/interval_tree.c index 324766e9bf63..9ceb084b6b4e 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -13,6 +13,7 @@ INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, EXPORT_SYMBOL_GPL(interval_tree_insert); EXPORT_SYMBOL_GPL(interval_tree_remove); +EXPORT_SYMBOL_GPL(interval_tree_subtree_search); EXPORT_SYMBOL_GPL(interval_tree_iter_first); EXPORT_SYMBOL_GPL(interval_tree_iter_next); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 2fe66a6b8789..896760bad455 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -49,12 +49,24 @@ size_t copy_from_user_iter(void __user *iter_from, size_t progress, if (should_fail_usercopy()) return len; - if (access_ok(iter_from, len)) { - to += progress; - instrument_copy_from_user_before(to, iter_from, len); - res = raw_copy_from_user(to, iter_from, len); - instrument_copy_from_user_after(to, iter_from, len, res); + if (can_do_masked_user_access()) { + iter_from = mask_user_address(iter_from); + } else { + if (!access_ok(iter_from, len)) + return res; + + /* + * Ensure that bad access_ok() speculation will not + * lead to nasty side effects *after* the copy is + * finished: + */ + barrier_nospec(); } + to += progress; + instrument_copy_from_user_before(to, iter_from, len); + res = raw_copy_from_user(to, iter_from, len); + instrument_copy_from_user_after(to, iter_from, len, res); + return res; } diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index ed99344317f5..d939403331b5 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -202,7 +202,7 @@ static void init_shared_classes(void) local_irq_disable(); \ __irq_enter(); \ lockdep_hardirq_threaded(); \ - WARN_ON(!in_irq()); + WARN_ON(!in_hardirq()); #define HARDIRQ_EXIT() \ __irq_exit(); \ @@ -2512,7 +2512,7 @@ DEFINE_LOCK_GUARD_0(NOTTHREADED_HARDIRQ, do { local_irq_disable(); __irq_enter(); - WARN_ON(!in_irq()); + WARN_ON(!in_hardirq()); } while(0), HARDIRQ_EXIT()) DEFINE_LOCK_GUARD_0(SOFTIRQ, SOFTIRQ_ENTER(), SOFTIRQ_EXIT()) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 39bb779cb311..5aa4c9500018 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -64,6 +64,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/maple_tree.h> +#define TP_FCT tracepoint_string(__func__) + /* * Kernel pointer hashing renders much of the maple tree dump useless as tagged * pointers get hashed to arbitrary values. @@ -2756,7 +2758,7 @@ static inline void mas_rebalance(struct ma_state *mas, MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); - trace_ma_op(__func__, mas); + trace_ma_op(TP_FCT, mas); /* * Rebalancing occurs if a node is insufficient. Data is rebalanced @@ -2997,7 +2999,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); - trace_ma_op(__func__, mas); + trace_ma_op(TP_FCT, mas); mast.l = &l_mas; mast.r = &r_mas; @@ -3172,7 +3174,7 @@ static bool mas_is_span_wr(struct ma_wr_state *wr_mas) return false; } - trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry); + trace_ma_write(TP_FCT, wr_mas->mas, wr_mas->r_max, entry); return true; } @@ -3416,7 +3418,7 @@ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) * of data may happen. */ mas = wr_mas->mas; - trace_ma_op(__func__, mas); + trace_ma_op(TP_FCT, mas); if (unlikely(!mas->index && mas->last == ULONG_MAX)) return mas_new_root(mas, wr_mas->entry); @@ -3552,7 +3554,7 @@ done: } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } - trace_ma_write(__func__, mas, 0, wr_mas->entry); + trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; return; @@ -3596,7 +3598,7 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) mas->offset++; /* Keep mas accurate. */ } - trace_ma_write(__func__, mas, 0, wr_mas->entry); + trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); /* * Only update gap when the new entry is empty or there is an empty * entry in the original two ranges. @@ -3717,7 +3719,7 @@ static inline void mas_wr_append(struct ma_wr_state *wr_mas, mas_update_gap(mas); mas->end = new_end; - trace_ma_write(__func__, mas, new_end, wr_mas->entry); + trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry); return; } @@ -3731,7 +3733,7 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas) { struct maple_big_node b_node; - trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); + trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); mas_commit_b_node(wr_mas, &b_node); @@ -5062,7 +5064,7 @@ void *mas_store(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); - trace_ma_write(__func__, mas, 0, entry); + trace_ma_write(TP_FCT, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, @@ -5163,7 +5165,7 @@ void mas_store_prealloc(struct ma_state *mas, void *entry) } store: - trace_ma_write(__func__, mas, 0, entry); + trace_ma_write(TP_FCT, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); mas_destroy(mas); @@ -5882,7 +5884,7 @@ void *mtree_load(struct maple_tree *mt, unsigned long index) MA_STATE(mas, mt, index, index); void *entry; - trace_ma_read(__func__, &mas); + trace_ma_read(TP_FCT, &mas); rcu_read_lock(); retry: entry = mas_start(&mas); @@ -5925,7 +5927,7 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index, MA_STATE(mas, mt, index, last); int ret = 0; - trace_ma_write(__func__, &mas, 0, entry); + trace_ma_write(TP_FCT, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; @@ -6148,7 +6150,7 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index) void *entry = NULL; MA_STATE(mas, mt, index, index); - trace_ma_op(__func__, &mas); + trace_ma_op(TP_FCT, &mas); mtree_lock(mt); entry = mas_erase(&mas); @@ -6485,7 +6487,7 @@ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) unsigned long copy = *index; #endif - trace_ma_read(__func__, &mas); + trace_ma_read(TP_FCT, &mas); if ((*index) > max) return NULL; diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 6dc234913dd5..5bb752ff7c61 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -126,7 +126,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) if (can_do_masked_user_access()) { long retval; - src = masked_user_access_begin(src); + src = masked_user_read_access_begin(src); retval = do_strncpy_from_user(dst, src, count, count); user_read_access_end(); return retval; diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 6e489f9e90f1..4a6574b67f82 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -99,7 +99,7 @@ long strnlen_user(const char __user *str, long count) if (can_do_masked_user_access()) { long retval; - str = masked_user_access_begin(str); + str = masked_user_read_access_begin(str); retval = do_strnlen_user(str, count, count); user_read_access_end(); return retval; diff --git a/lib/test_kho.c b/lib/test_kho.c index 60cd899ea745..fff018e5548d 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -301,6 +301,9 @@ static int __init kho_test_init(void) phys_addr_t fdt_phys; int err; + if (!kho_is_enabled()) + return 0; + err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys); if (!err) return kho_test_restore(fdt_phys); diff --git a/mm/Kconfig b/mm/Kconfig index 0e26f4fc8717..ca3f146bc705 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -908,6 +908,13 @@ config PAGE_MAPCOUNT config PGTABLE_HAS_HUGE_LEAVES def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE +# +# We can end up creating gigantic folio. +# +config HAVE_GIGANTIC_FOLIOS + def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \ + (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + # TODO: Allow to be enabled without THP config ARCH_SUPPORTS_HUGE_PFNMAP def_bool n diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 41b6c9386b69..c5740c6d37a2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -72,7 +72,7 @@ static void collect_wb_stats(struct wb_stats *stats, list_for_each_entry(inode, &wb->b_more_io, i_io_list) stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) - if (inode->i_state & I_DIRTY_TIME) + if (inode_state_read_once(inode) & I_DIRTY_TIME) stats->nr_dirty_time++; spin_unlock(&wb->list_lock); diff --git a/mm/damon/stat.c b/mm/damon/stat.c index d8010968bbed..bf8626859902 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -46,6 +46,8 @@ MODULE_PARM_DESC(aggr_interval_us, static struct damon_ctx *damon_stat_context; +static unsigned long damon_stat_last_refresh_jiffies; + static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c) { struct damon_target *t; @@ -130,13 +132,12 @@ static void damon_stat_set_idletime_percentiles(struct damon_ctx *c) static int damon_stat_damon_call_fn(void *data) { struct damon_ctx *c = data; - static unsigned long last_refresh_jiffies; /* avoid unnecessarily frequent stat update */ - if (time_before_eq(jiffies, last_refresh_jiffies + + if (time_before_eq(jiffies, damon_stat_last_refresh_jiffies + msecs_to_jiffies(5 * MSEC_PER_SEC))) return 0; - last_refresh_jiffies = jiffies; + damon_stat_last_refresh_jiffies = jiffies; aggr_interval_us = c->attrs.aggr_interval; damon_stat_set_estimated_memory_bandwidth(c); @@ -210,6 +211,8 @@ static int damon_stat_start(void) err = damon_start(&damon_stat_context, 1, true); if (err) return err; + + damon_stat_last_refresh_jiffies = jiffies; call_control.data = damon_stat_context; return damon_call(damon_stat_context, &call_control); } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index cd6815ecc04e..3c0d727788c8 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1552,16 +1552,17 @@ static struct damon_ctx *damon_sysfs_build_ctx( return ctx; } +static unsigned long damon_sysfs_next_update_jiffies; + static int damon_sysfs_repeat_call_fn(void *data) { struct damon_sysfs_kdamond *sysfs_kdamond = data; - static unsigned long next_update_jiffies; if (!sysfs_kdamond->refresh_ms) return 0; - if (time_before(jiffies, next_update_jiffies)) + if (time_before(jiffies, damon_sysfs_next_update_jiffies)) return 0; - next_update_jiffies = jiffies + + damon_sysfs_next_update_jiffies = jiffies + msecs_to_jiffies(sysfs_kdamond->refresh_ms); if (!mutex_trylock(&damon_sysfs_lock)) @@ -1607,6 +1608,9 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) } kdamond->damon_ctx = ctx; + damon_sysfs_next_update_jiffies = + jiffies + msecs_to_jiffies(kdamond->refresh_ms); + repeat_call_control->fn = damon_sysfs_repeat_call_fn; repeat_call_control->data = kdamond; repeat_call_control->repeat = true; diff --git a/mm/fadvise.c b/mm/fadvise.c index 588fe76c5a14..67028e30aa91 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -111,8 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) spin_unlock(&file->f_lock); break; case POSIX_FADV_DONTNEED: - __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + filemap_flush_range(mapping, offset, endbyte); /* * First and last FULL page! Partial pages are deliberately diff --git a/mm/filemap.c b/mm/filemap.c index 13f0259d993c..dfc8a31f1222 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -256,7 +256,7 @@ void filemap_remove_folio(struct folio *folio) __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); @@ -335,7 +335,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); for (i = 0; i < folio_batch_count(fbatch); i++) @@ -366,83 +366,60 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) return 0; } -/** - * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @wbc: the writeback_control controlling the writeout - * - * Call writepages on the mapping using the provided wbc to control the - * writeout. - * - * Return: %0 on success, negative error code otherwise. - */ -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc) +static int filemap_writeback(struct address_space *mapping, loff_t start, + loff_t end, enum writeback_sync_modes sync_mode, + long *nr_to_write) { + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX, + .range_start = start, + .range_end = end, + }; int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; - wbc_attach_fdatawrite_inode(wbc, mapping->host); - ret = do_writepages(mapping, wbc); - wbc_detach_inode(wbc); + wbc_attach_fdatawrite_inode(&wbc, mapping->host); + ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); + + if (!ret && nr_to_write) + *nr_to_write = wbc.nr_to_write; return ret; } -EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. * - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. + * This is a data integrity operation that waits upon dirty or in writeback + * pages. * * Return: %0 on success, negative error code otherwise. */ -int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - return filemap_fdatawrite_wbc(mapping, &wbc); -} - -static inline int __filemap_fdatawrite(struct address_space *mapping, - int sync_mode) +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) { - return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); + return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); int filemap_fdatawrite(struct address_space *mapping) { - return __filemap_fdatawrite(mapping, WB_SYNC_ALL); + return filemap_fdatawrite_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_fdatawrite); -int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end) -{ - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); -} -EXPORT_SYMBOL(filemap_fdatawrite_range); - /** - * filemap_fdatawrite_range_kick - start writeback on a range + * filemap_flush_range - start writeback on a range * @mapping: target address_space * @start: index to start writeback on * @end: last (inclusive) index for writeback @@ -452,12 +429,12 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); * * Return: %0 on success, negative error code otherwise. */ -int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, +int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end) { - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); + return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL); } -EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); +EXPORT_SYMBOL_GPL(filemap_flush_range); /** * filemap_flush - mostly a non-blocking flush @@ -470,10 +447,22 @@ EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); */ int filemap_flush(struct address_space *mapping) { - return __filemap_fdatawrite(mapping, WB_SYNC_NONE); + return filemap_flush_range(mapping, 0, LLONG_MAX); } EXPORT_SYMBOL(filemap_flush); +/* + * Start writeback on @nr_to_write pages from @mapping. No one but the existing + * btrfs caller should be using this. Talk to linux-mm if you think adding a + * new caller is a good idea. + */ +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write) +{ + return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE, + nr_to_write); +} +EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs"); + /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check @@ -691,8 +680,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. @@ -794,8 +782,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) return 0; if (mapping_needs_writeback(mapping)) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); + err = filemap_fdatawrite_range(mapping, lstart, lend); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); @@ -2366,6 +2353,64 @@ out: } EXPORT_SYMBOL(filemap_get_folios_tag); +/** + * filemap_get_folios_dirty - Get a batch of dirty folios + * @mapping: The address_space to search + * @start: The starting folio index + * @end: The final folio index (inclusive) + * @fbatch: The batch to fill + * + * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except + * the returned folios are presumed to be dirty or undergoing writeback. Dirty + * state is presumed because we don't block on folio lock nor want to miss + * folios. Callers that need to can recheck state upon locking the folio. + * + * This may not return all dirty folios if the batch gets filled up. + * + * Return: The number of folios found. + * Also update @start to be positioned for traversal of the next folio. + */ +unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start, + pgoff_t end, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, *start); + struct folio *folio; + + rcu_read_lock(); + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + if (xa_is_value(folio)) + continue; + if (folio_trylock(folio)) { + bool clean = !folio_test_dirty(folio) && + !folio_test_writeback(folio); + folio_unlock(folio); + if (clean) { + folio_put(folio); + continue; + } + } + if (!folio_batch_add(fbatch, folio)) { + unsigned long nr = folio_nr_pages(folio); + *start = folio->index + nr; + goto out; + } + } + /* + * We come here when there is no folio beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is a folio at index -1 but that is + * already broke anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return folio_batch_count(fbatch); +} + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -3681,8 +3726,10 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned long *rss, unsigned short *mmap_miss) + unsigned long *rss, unsigned short *mmap_miss, + pgoff_t file_end) { + struct address_space *mapping = folio->mapping; unsigned int ref_from_caller = 1; vm_fault_t ret = 0; struct page *page = folio_page(folio, start); @@ -3691,12 +3738,16 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, unsigned long addr0; /* - * Map the large folio fully where possible. + * Map the large folio fully where possible: * - * The folio must not cross VMA or page table boundary. + * - The folio is fully within size of the file or belong + * to shmem/tmpfs; + * - The folio doesn't cross VMA boundary; + * - The folio doesn't cross page table boundary; */ addr0 = addr - start * PAGE_SIZE; - if (folio_within_vma(folio, vmf->vma) && + if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && + folio_within_vma(folio, vmf->vma) && (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) { vmf->pte -= start; page -= start; @@ -3817,7 +3868,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (!folio) goto out; - if (filemap_map_pmd(vmf, folio, start_pgoff)) { + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; + end_pgoff = min(end_pgoff, file_end); + + /* + * Do not allow to map with PMD across i_size to preserve + * SIGBUS semantics. + * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && + filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } @@ -3830,10 +3892,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto out; } - file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; - if (end_pgoff > file_end) - end_pgoff = file_end; - folio_type = mm_counter_file(folio); do { unsigned long end; @@ -3850,7 +3908,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, - nr_pages, &rss, &mmap_miss); + nr_pages, &rss, &mmap_miss, file_end); folio_unlock(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); @@ -4457,16 +4515,8 @@ int filemap_invalidate_inode(struct inode *inode, bool flush, unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ - if (flush) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = start, - .range_end = end, - }; - - filemap_fdatawrite_wbc(mapping, &wbc); - } + if (flush) + filemap_fdatawrite_range(mapping, start, end); /* Wait for writeback to complete on all folios and discard. */ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1d1b74950332..6cba1cb14b23 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -214,7 +214,8 @@ retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) return true; - zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) & + ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_folio) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); @@ -3263,6 +3264,14 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) caller_pins; } +static bool page_range_has_hwpoisoned(struct page *page, long nr_pages) +{ + for (; nr_pages; page++, nr_pages--) + if (PageHWPoison(page)) + return true; + return false; +} + /* * It splits @folio into @new_order folios and copies the @folio metadata to * all the resulting folios. @@ -3270,17 +3279,24 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) static void __split_folio_to_order(struct folio *folio, int old_order, int new_order) { + /* Scan poisoned pages when split a poisoned folio to large folios */ + const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order; long new_nr_pages = 1 << new_order; long nr_pages = 1 << old_order; long i; + folio_clear_has_hwpoisoned(folio); + + /* Check first new_nr_pages since the loop below skips them */ + if (handle_hwpoison && + page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages)) + folio_set_has_hwpoisoned(folio); /* * Skip the first new_nr_pages, since the new folio from them have all * the flags from the original folio. */ for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) { struct page *new_head = &folio->page + i; - /* * Careful: new_folio is not a "real" folio before we cleared PageTail. * Don't pass it around before clear_compound_head(). @@ -3322,6 +3338,10 @@ static void __split_folio_to_order(struct folio *folio, int old_order, (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); + if (handle_hwpoison && + page_range_has_hwpoisoned(new_head, new_nr_pages)) + folio_set_has_hwpoisoned(new_folio); + new_folio->mapping = folio->mapping; new_folio->index = folio->index + i; @@ -3422,8 +3442,6 @@ static int __split_unmapped_folio(struct folio *folio, int new_order, if (folio_test_anon(folio)) mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); - folio_clear_has_hwpoisoned(folio); - /* * split to new_order one order at a time. For uniform split, * folio is split to new_order directly. @@ -3504,7 +3522,8 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, /* order-1 is not supported for anonymous THP. */ VM_WARN_ONCE(warns && new_order == 1, "Cannot split to order-1 folio"); - return new_order != 1; + if (new_order == 1) + return false; } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !mapping_large_folio_support(folio->mapping)) { /* @@ -3535,7 +3554,8 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order, if (folio_test_anon(folio)) { VM_WARN_ONCE(warns && new_order == 1, "Cannot split to order-1 folio"); - return new_order != 1; + if (new_order == 1) + return false; } else if (new_order) { if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !mapping_large_folio_support(folio->mapping)) { @@ -3599,6 +3619,16 @@ static int __folio_split(struct folio *folio, unsigned int new_order, if (folio != page_folio(split_at) || folio != page_folio(lock_at)) return -EINVAL; + /* + * Folios that just got truncated cannot get split. Signal to the + * caller that there was a race. + * + * TODO: this will also currently refuse shmem folios that are in the + * swapcache. + */ + if (!is_anon && !folio->mapping) + return -EBUSY; + if (new_order >= folio_order(folio)) return -EINVAL; @@ -3639,22 +3669,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, gfp_t gfp; mapping = folio->mapping; - - /* Truncated ? */ - /* - * TODO: add support for large shmem folio in swap cache. - * When shmem is in swap cache, mapping is NULL and - * folio_test_swapcache() is true. - */ - if (!mapping) { - ret = -EBUSY; - goto out; - } - min_order = mapping_min_folio_order(folio->mapping); if (new_order < min_order) { - VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", - min_order); ret = -EINVAL; goto out; } @@ -3986,12 +4002,7 @@ int min_order_for_split(struct folio *folio) int split_folio_to_list(struct folio *folio, struct list_head *list) { - int ret = min_order_for_split(folio); - - if (ret < 0) - return ret; - - return split_huge_page_to_list_to_order(&folio->page, list, ret); + return split_huge_page_to_list_to_order(&folio->page, list, 0); } /* diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 8bca7fece47f..35ceaa8adb41 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -72,9 +72,6 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); - /* Don't sleep. */ - flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM); - handle = stack_depot_save(entries, nr_entries, flags); return stack_depot_set_extra_bits(handle, extra); } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 2cee59d89c80..8f22d1f22981 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -84,7 +84,8 @@ void kmsan_slab_free(struct kmem_cache *s, void *object) if (s->ctor) return; kmsan_enter_runtime(); - kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL, + kmsan_internal_poison_memory(object, s->object_size, + GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } @@ -114,7 +115,8 @@ void kmsan_kfree_large(const void *ptr) kmsan_enter_runtime(); page = virt_to_head_page((void *)ptr); KMSAN_WARN_ON(ptr != page_address(page)); - kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL, + kmsan_internal_poison_memory((void *)ptr, page_size(page), + GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 54f3c3c962f0..55fdea199aaf 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -208,7 +208,7 @@ void kmsan_free_page(struct page *page, unsigned int order) return; kmsan_enter_runtime(); kmsan_internal_poison_memory(page_address(page), page_size(page), - GFP_KERNEL, + GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); } @@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio, return true; } +struct ksm_next_page_arg { + struct folio *folio; + struct page *page; + unsigned long addr; +}; + +static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct ksm_next_page_arg *private = walk->private; + struct vm_area_struct *vma = walk->vma; + pte_t *start_ptep = NULL, *ptep, pte; + struct mm_struct *mm = walk->mm; + struct folio *folio; + struct page *page; + spinlock_t *ptl; + pmd_t pmd; + + if (ksm_test_exit(mm)) + return 0; + + cond_resched(); + + pmd = pmdp_get_lockless(pmdp); + if (!pmd_present(pmd)) + return 0; + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) { + ptl = pmd_lock(mm, pmdp); + pmd = pmdp_get(pmdp); + + if (!pmd_present(pmd)) { + goto not_found_unlock; + } else if (pmd_leaf(pmd)) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (!page) + goto not_found_unlock; + folio = page_folio(page); + + if (folio_is_zone_device(folio) || !folio_test_anon(folio)) + goto not_found_unlock; + + page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT); + goto found_unlock; + } + spin_unlock(ptl); + } + + start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!start_ptep) + return 0; + + for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) { + pte = ptep_get(ptep); + + if (!pte_present(pte)) + continue; + + page = vm_normal_page(vma, addr, pte); + if (!page) + continue; + folio = page_folio(page); + + if (folio_is_zone_device(folio) || !folio_test_anon(folio)) + continue; + goto found_unlock; + } + +not_found_unlock: + spin_unlock(ptl); + if (start_ptep) + pte_unmap(start_ptep); + return 0; +found_unlock: + folio_get(folio); + spin_unlock(ptl); + if (start_ptep) + pte_unmap(start_ptep); + private->page = page; + private->folio = folio; + private->addr = addr; + return 1; +} + +static struct mm_walk_ops ksm_next_page_ops = { + .pmd_entry = ksm_next_page_pmd_entry, + .walk_lock = PGWALK_RDLOCK, +}; + static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; @@ -2542,21 +2631,27 @@ next_mm: ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { + struct ksm_next_page_arg ksm_next_page_arg; struct page *tmp_page = NULL; - struct folio_walk fw; struct folio *folio; if (ksm_test_exit(mm)) break; - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); - if (folio) { - if (!folio_is_zone_device(folio) && - folio_test_anon(folio)) { - folio_get(folio); - tmp_page = fw.page; - } - folio_walk_end(&fw, vma); + int found; + + found = walk_page_range_vma(vma, ksm_scan.address, + vma->vm_end, + &ksm_next_page_ops, + &ksm_next_page_arg); + + if (found > 0) { + folio = ksm_next_page_arg.folio; + tmp_page = ksm_next_page_arg.page; + ksm_scan.address = ksm_next_page_arg.addr; + } else { + VM_WARN_ON_ONCE(found < 0); + ksm_scan.address = vma->vm_end - PAGE_SIZE; } if (tmp_page) { diff --git a/mm/memblock.c b/mm/memblock.c index e23e16618e9b..f0f2dc66e9a2 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1826,7 +1826,8 @@ phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int n */ unsigned long __init memblock_estimated_nr_free_pages(void) { - return PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size()); + return PHYS_PFN(memblock_phys_mem_size() - + memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, NUMA_NO_NODE)); } /* lowest address */ diff --git a/mm/memfd.c b/mm/memfd.c index 1d109c1acf21..805e297916e5 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -96,9 +96,36 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) NULL, gfp_mask); if (folio) { + u32 hash; + + /* + * Zero the folio to prevent information leaks to userspace. + * Use folio_zero_user() which is optimized for huge/gigantic + * pages. Pass 0 as addr_hint since this is not a faulting path + * and we don't have a user virtual address yet. + */ + folio_zero_user(folio, 0); + + /* + * Mark the folio uptodate before adding to page cache, + * as required by filemap.c and other hugetlb paths. + */ + __folio_mark_uptodate(folio); + + /* + * Serialize hugepage allocation and instantiation to prevent + * races with concurrent allocations, as required by all other + * callers of hugetlb_add_to_page_cache(). + */ + hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx); + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + if (err) { folio_put(folio); goto err_unresv; @@ -470,9 +497,9 @@ SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { - struct file *file; - int fd, error; - char *name; + char *name __free(kfree) = NULL; + unsigned int fd_flags; + int error; error = sanitize_flags(&flags); if (error < 0) @@ -482,25 +509,6 @@ SYSCALL_DEFINE2(memfd_create, if (IS_ERR(name)) return PTR_ERR(name); - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_free_name; - } - - file = alloc_file(name, flags); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_free_fd; - } - - fd_install(fd, file); - kfree(name); - return fd; - -err_free_fd: - put_unused_fd(fd); -err_free_name: - kfree(name); - return error; + fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0; + return FD_ADD(fd_flags, alloc_file(name, flags)); } diff --git a/mm/memory.c b/mm/memory.c index 74b45e258323..b59ae7ce42eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -65,6 +65,7 @@ #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> +#include <linux/shmem_fs.h> #include <linux/memory-tiers.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> @@ -5501,8 +5502,25 @@ fallback: return ret; } + if (!needs_fallback && vma->vm_file) { + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t file_end; + + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + + /* + * Do not allow to map with PTEs beyond i_size and with PMD + * across i_size to preserve SIGBUS semantics. + * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + needs_fallback = !shmem_mapping(mapping) && + file_end < folio_next_index(folio); + } + if (pmd_none(*vmf->pmd)) { - if (folio_test_pmd_mappable(folio)) { + if (!needs_fallback && folio_test_pmd_mappable(folio)) { ret = do_set_pmd(vmf, folio, page); if (ret != VM_FAULT_FALLBACK) return ret; diff --git a/mm/mempool.c b/mm/mempool.c index 1c38e873e546..d7bbf1189db9 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -68,10 +68,20 @@ static void check_element(mempool_t *pool, void *element) } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; - void *addr = kmap_local_page((struct page *)element); - __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); - kunmap_local(addr); +#ifdef CONFIG_HIGHMEM + for (int i = 0; i < (1 << order); i++) { + struct page *page = (struct page *)element; + void *addr = kmap_local_page(page + i); + + __check_element(pool, addr, PAGE_SIZE); + kunmap_local(addr); + } +#else + void *addr = page_address((struct page *)element); + + __check_element(pool, addr, PAGE_SIZE << order); +#endif } } @@ -97,10 +107,20 @@ static void poison_element(mempool_t *pool, void *element) } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; - void *addr = kmap_local_page((struct page *)element); - __poison_element(addr, 1UL << (PAGE_SHIFT + order)); - kunmap_local(addr); +#ifdef CONFIG_HIGHMEM + for (int i = 0; i < (1 << order); i++) { + struct page *page = (struct page *)element; + void *addr = kmap_local_page(page + i); + + __poison_element(addr, PAGE_SIZE); + kunmap_local(addr); + } +#else + void *addr = page_address((struct page *)element); + + __poison_element(addr, PAGE_SIZE << order); +#endif } } #else /* CONFIG_SLUB_DEBUG_ON */ diff --git a/mm/mm_init.c b/mm/mm_init.c index 3db2dea7db4c..7712d887b696 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2469,7 +2469,7 @@ void *__init alloc_large_system_hash(const char *tablename, panic("Failed to allocate %s hash table\n", tablename); pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", - tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, + tablename, 1UL << log2qty, get_order(size), size, virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); if (_hash_shift) diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 0a0db5849b8e..42e3dde73e74 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -241,6 +241,7 @@ retry: if (PTR_ERR(vma) == -EAGAIN) { count_vm_vma_lock_event(VMA_LOCK_MISS); /* The area was replaced with another one */ + mas_set(&mas, address); goto retry; } diff --git a/mm/mremap.c b/mm/mremap.c index bd7314898ec5..419a0ea0a870 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -187,7 +187,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr if (!folio || !folio_test_large(folio)) return 1; - return folio_pte_batch(folio, ptep, pte, max_nr); + return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE); } static int move_ptes(struct pagetable_move_control *pmc, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 757bc4d3b5b5..a124ab6a205d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2434,12 +2434,6 @@ static bool folio_prepare_writeback(struct address_space *mapping, return true; } -static xa_mark_t wbc_to_tag(struct writeback_control *wbc) -{ - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - return PAGECACHE_TAG_TOWRITE; - return PAGECACHE_TAG_DIRTY; -} static pgoff_t wbc_end(struct writeback_control *wbc) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 600d9e981c23..ed82ee55e66a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1822,14 +1822,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * If memory tags should be zeroed * (which happens only when memory should be initialized as well). */ - if (zero_tags) { - /* Initialize both memory and memory tags. */ - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); + if (zero_tags) + init = !tag_clear_highpages(page, 1 << order); - /* Take note that memory was initialized by the loop above. */ - init = false; - } if (!should_skip_kasan_unpoison(gfp_flags) && kasan_unpoison_pages(page, order, init)) { /* Take note that memory was initialized by KASAN. */ diff --git a/mm/secretmem.c b/mm/secretmem.c index 60137305bc20..f0ef4e198884 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -82,13 +82,13 @@ retry: __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { - folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(folio_page(folio, 0)); + folio_put(folio); if (err == -EEXIST) goto retry; @@ -224,9 +224,6 @@ err_free_inode: SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { - struct file *file; - int fd, err; - /* make sure local flags do not confict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); @@ -238,22 +235,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) if (atomic_read(&secretmem_users) < 0) return -ENFILE; - fd = get_unused_fd_flags(flags & O_CLOEXEC); - if (fd < 0) - return fd; - - file = secretmem_file_create(flags); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto err_put_fd; - } - - fd_install(fd, file); - return fd; - -err_put_fd: - put_unused_fd(fd); - return err; + return FD_ADD(flags & O_CLOEXEC, secretmem_file_create(flags)); } static int secretmem_init_fs_context(struct fs_context *fc) diff --git a/mm/shmem.c b/mm/shmem.c index b9081b817d28..899303d8c9aa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -131,8 +131,7 @@ struct shmem_options { #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 -#define SHMEM_SEEN_NOSWAP 16 -#define SHMEM_SEEN_QUOTA 32 +#define SHMEM_SEEN_QUOTA 16 }; #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1076,7 +1075,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ -static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, +static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; @@ -1133,7 +1132,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { - same_folio = lend < folio_pos(folio) + folio_size(folio); + same_folio = lend < folio_next_pos(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); @@ -1227,7 +1226,7 @@ whole_folios: shmem_recalc_inode(inode, 0, -nr_swaps_freed); } -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); @@ -1882,6 +1881,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, struct shmem_inode_info *info = SHMEM_I(inode); unsigned long suitable_orders = 0; struct folio *folio = NULL; + pgoff_t aligned_index; long pages; int error, order; @@ -1895,10 +1895,12 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, order = highest_order(suitable_orders); while (suitable_orders) { pages = 1UL << order; - index = round_down(index, pages); - folio = shmem_alloc_folio(gfp, order, info, index); - if (folio) + aligned_index = round_down(index, pages); + folio = shmem_alloc_folio(gfp, order, info, aligned_index); + if (folio) { + index = aligned_index; goto allocated; + } if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); @@ -4677,7 +4679,6 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; - ctx->seen |= SHMEM_SEEN_NOSWAP; break; case Opt_quota: if (fc->user_ns != &init_user_ns) @@ -4827,14 +4828,15 @@ static int shmem_reconfigure(struct fs_context *fc) err = "Current inum too high to switch to 32-bit inums"; goto out; } - if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { + + /* + * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" + * counterpart for (re-)enabling swap. + */ + if (ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } - if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { - err = "Cannot enable swap on remount if it was disabled on first mount"; - goto out; - } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { @@ -5776,7 +5778,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, } #endif -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); } diff --git a/mm/slub.c b/mm/slub.c index d4367f25b20d..a0b905c2a557 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2046,7 +2046,11 @@ static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) if (slab_exts) { unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, obj_exts_slab, obj_exts); - /* codetag should be NULL */ + + if (unlikely(is_codetag_empty(&slab_exts[offs].ref))) + return; + + /* codetag should be NULL here */ WARN_ON(slab_exts[offs].ref.ct); set_codetag_empty(&slab_exts[offs].ref); } @@ -4666,8 +4670,12 @@ new_objects: if (kmem_cache_debug(s)) { freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - if (unlikely(!freelist)) + if (unlikely(!freelist)) { + /* This could cause an endless loop. Fail instead. */ + if (!allow_spin) + return NULL; goto new_objects; + } if (s->flags & SLAB_STORE_USER) set_track(s, freelist, TRACK_ALLOC, addr, @@ -6328,8 +6336,6 @@ next_remote_batch: if (unlikely(!slab_free_hook(s, p[i], init, false))) { p[i] = p[--size]; - if (!size) - goto flush_remote; continue; } @@ -6344,6 +6350,9 @@ next_remote_batch: i++; } + if (!size) + goto flush_remote; + next_batch: if (!local_trylock(&s->cpu_sheaves->lock)) goto fallback; @@ -6398,6 +6407,9 @@ do_free: goto next_batch; } + if (remote_nr) + goto flush_remote; + return; no_empty: diff --git a/mm/swap_state.c b/mm/swap_state.c index b13e9c4baa90..f4980dde5394 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -748,6 +748,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, blk_start_plug(&plug); for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { + struct swap_info_struct *si = NULL; + if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -761,8 +763,19 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; + /* + * Readahead entry may come from a device that we are not + * holding a reference to, try to grab a reference, or skip. + */ + if (swp_type(entry) != swp_type(targ_entry)) { + si = get_swap_device(entry); + if (!si) + continue; + } folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); + if (si) + put_swap_device(si); if (!folio) continue; if (page_allocated) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 10760240a3a2..a1b4b9d80e3b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2005,10 +2005,8 @@ swp_entry_t get_swap_page_of_type(int type) local_lock(&percpu_swap_cluster.lock); offset = cluster_alloc_swap_entry(si, 0, 1); local_unlock(&percpu_swap_cluster.lock); - if (offset) { + if (offset) entry = swp_entry(si->type, offset); - atomic_long_dec(&nr_swap_pages); - } } put_swap_device(si); } diff --git a/mm/truncate.c b/mm/truncate.c index 91eb92a5ce4f..12467c1bd711 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -46,7 +46,7 @@ static void clear_shadow_entries(struct address_space *mapping, xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } @@ -111,7 +111,7 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping, xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); out: folio_batch_remove_exceptionals(fbatch); @@ -177,6 +177,32 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio) return 0; } +static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at, + unsigned long min_order) +{ + enum ttu_flags ttu_flags = + TTU_SYNC | + TTU_SPLIT_HUGE_PMD | + TTU_IGNORE_MLOCK; + int ret; + + ret = try_folio_split_to_order(folio, split_at, min_order); + + /* + * If the split fails, unmap the folio, so it will be refaulted + * with PTEs to respect SIGBUS semantics. + * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + if (ret && !shmem_mapping(folio->mapping)) { + try_to_unmap(folio, ttu_flags); + WARN_ON(folio_mapped(folio)); + } + + return ret; +} + /* * Handle partial folios. The folio may be entirely within the * range if a split has raced with us. If not, we zero the part of the @@ -194,6 +220,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) size_t size = folio_size(folio); unsigned int offset, length; struct page *split_at, *split_at2; + unsigned int min_order; if (pos < start) offset = start - pos; @@ -223,8 +250,9 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) if (!folio_test_large(folio)) return true; + min_order = mapping_min_folio_order(folio->mapping); split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); - if (!try_folio_split(folio, split_at, NULL)) { + if (!try_folio_split_or_unmap(folio, split_at, min_order)) { /* * try to split at offset + length to make sure folios within * the range can be dropped, especially to avoid memory waste @@ -248,13 +276,10 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) if (!folio_trylock(folio2)) goto out; - /* - * make sure folio2 is large and does not change its mapping. - * Its split result does not matter here. - */ + /* make sure folio2 is large and does not change its mapping */ if (folio_test_large(folio2) && folio2->mapping == folio->mapping) - try_folio_split(folio2, split_at2, NULL); + try_folio_split_or_unmap(folio2, split_at2, min_order); folio_unlock(folio2); out: @@ -339,7 +364,7 @@ long mapping_evict_folio(struct address_space *mapping, struct folio *folio) * page aligned properly. */ void truncate_inode_pages_range(struct address_space *mapping, - loff_t lstart, loff_t lend) + loff_t lstart, uoff_t lend) { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ @@ -387,7 +412,7 @@ void truncate_inode_pages_range(struct address_space *mapping, same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { - same_folio = lend < folio_pos(folio) + folio_size(folio); + same_folio = lend < folio_next_pos(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) @@ -622,7 +647,7 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); diff --git a/mm/vmscan.c b/mm/vmscan.c index b2fc8b626d3d..bb4a96c7b682 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -811,7 +811,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, __filemap_remove_folio(folio, shadow); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); if (free_folio) diff --git a/mm/workingset.c b/mm/workingset.c index 68a76a91111f..d32dc2e02a61 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -755,7 +755,7 @@ out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); + inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY; diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index fda3a80e9340..2b74ed56eb16 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -193,6 +193,8 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; + netdev_update_features(dev); + return 0; out_unregister_netdev: diff --git a/net/atm/common.c b/net/atm/common.c index 881c7f259dbd..c4edc1111bf0 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -881,7 +881,7 @@ out_atmproc_exit: out_atmsvc_exit: atmsvc_exit(); out_atmpvc_exit: - atmsvc_exit(); + atmpvc_exit(); out_unregister_vcc_proto: proto_unregister(&vcc_proto); goto out; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index f0c862091bff..2c21ae8abadc 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -53,6 +53,11 @@ static bool enable_6lowpan; static struct l2cap_chan *listen_chan; static DEFINE_MUTEX(set_lock); +enum { + LOWPAN_PEER_CLOSING, + LOWPAN_PEER_MAXBITS +}; + struct lowpan_peer { struct list_head list; struct rcu_head rcu; @@ -61,6 +66,8 @@ struct lowpan_peer { /* peer addresses in various formats */ unsigned char lladdr[ETH_ALEN]; struct in6_addr peer_addr; + + DECLARE_BITMAP(flags, LOWPAN_PEER_MAXBITS); }; struct lowpan_btle_dev { @@ -289,6 +296,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, local_skb->pkt_type = PACKET_HOST; local_skb->dev = dev; + skb_reset_mac_header(local_skb); skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { @@ -919,7 +927,9 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) BT_DBG("peer %p chan %p", peer, peer->chan); + l2cap_chan_lock(peer->chan); l2cap_chan_close(peer->chan, ENOENT); + l2cap_chan_unlock(peer->chan); return 0; } @@ -956,10 +966,11 @@ static struct l2cap_chan *bt_6lowpan_listen(void) } static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, - struct l2cap_conn **conn) + struct l2cap_conn **conn, bool disconnect) { struct hci_conn *hcon; struct hci_dev *hdev; + int le_addr_type; int n; n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", @@ -970,13 +981,32 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, if (n < 7) return -EINVAL; + if (disconnect) { + /* The "disconnect" debugfs command has used different address + * type constants than "connect" since 2015. Let's retain that + * for now even though it's obviously buggy... + */ + *addr_type += 1; + } + + switch (*addr_type) { + case BDADDR_LE_PUBLIC: + le_addr_type = ADDR_LE_DEV_PUBLIC; + break; + case BDADDR_LE_RANDOM: + le_addr_type = ADDR_LE_DEV_RANDOM; + break; + default: + return -EINVAL; + } + /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */ hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC); if (!hdev) return -ENOENT; hci_dev_lock(hdev); - hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type); + hcon = hci_conn_hash_lookup_le(hdev, addr, le_addr_type); hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -993,41 +1023,52 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, static void disconnect_all_peers(void) { struct lowpan_btle_dev *entry; - struct lowpan_peer *peer, *tmp_peer, *new_peer; - struct list_head peers; - - INIT_LIST_HEAD(&peers); + struct lowpan_peer *peer; + int nchans; - /* We make a separate list of peers as the close_cb() will - * modify the device peers list so it is better not to mess - * with the same list at the same time. + /* l2cap_chan_close() cannot be called from RCU, and lock ordering + * chan->lock > devices_lock prevents taking write side lock, so copy + * then close. */ rcu_read_lock(); + list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) + list_for_each_entry_rcu(peer, &entry->peers, list) + clear_bit(LOWPAN_PEER_CLOSING, peer->flags); + rcu_read_unlock(); - list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { - list_for_each_entry_rcu(peer, &entry->peers, list) { - new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC); - if (!new_peer) - break; + do { + struct l2cap_chan *chans[32]; + int i; - new_peer->chan = peer->chan; - INIT_LIST_HEAD(&new_peer->list); + nchans = 0; - list_add(&new_peer->list, &peers); - } - } + spin_lock(&devices_lock); - rcu_read_unlock(); + list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { + list_for_each_entry_rcu(peer, &entry->peers, list) { + if (test_and_set_bit(LOWPAN_PEER_CLOSING, + peer->flags)) + continue; - spin_lock(&devices_lock); - list_for_each_entry_safe(peer, tmp_peer, &peers, list) { - l2cap_chan_close(peer->chan, ENOENT); + l2cap_chan_hold(peer->chan); + chans[nchans++] = peer->chan; - list_del_rcu(&peer->list); - kfree_rcu(peer, rcu); - } - spin_unlock(&devices_lock); + if (nchans >= ARRAY_SIZE(chans)) + goto done; + } + } + +done: + spin_unlock(&devices_lock); + + for (i = 0; i < nchans; ++i) { + l2cap_chan_lock(chans[i]); + l2cap_chan_close(chans[i], ENOENT); + l2cap_chan_unlock(chans[i]); + l2cap_chan_put(chans[i]); + } + } while (nchans); } struct set_enable { @@ -1050,7 +1091,9 @@ static void do_enable_set(struct work_struct *work) mutex_lock(&set_lock); if (listen_chan) { + l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); + l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); } @@ -1103,13 +1146,15 @@ static ssize_t lowpan_control_write(struct file *fp, buf[buf_size] = '\0'; if (memcmp(buf, "connect ", 8) == 0) { - ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn); + ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn, false); if (ret == -EINVAL) return ret; mutex_lock(&set_lock); if (listen_chan) { + l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); + l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); listen_chan = NULL; } @@ -1140,7 +1185,7 @@ static ssize_t lowpan_control_write(struct file *fp, } if (memcmp(buf, "disconnect ", 11) == 0) { - ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn); + ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn, true); if (ret < 0) return ret; @@ -1271,7 +1316,9 @@ static void __exit bt_6lowpan_exit(void) debugfs_remove(lowpan_control_debugfs); if (listen_chan) { + l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); + l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index c5dedf39a129..6fc0692abf05 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -769,21 +769,23 @@ static void find_bis(struct hci_conn *conn, void *data) d->count++; } -static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *conn) +static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn) { struct iso_list_data *d; int ret; - bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, conn->sync_handle); + bt_dev_dbg(hdev, "hcon %p big 0x%2.2x sync_handle 0x%4.4x", conn, + conn->iso_qos.bcast.big, conn->sync_handle); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; - d->big = big; + d->big = conn->iso_qos.bcast.big; d->sync_handle = conn->sync_handle; - if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { + if (conn->type == PA_LINK && + test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { hci_conn_hash_list_flag(hdev, find_bis, PA_LINK, HCI_CONN_PA_SYNC, d); @@ -801,6 +803,9 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c d->big_sync_term = true; } + if (!d->pa_sync_term && !d->big_sync_term) + return 0; + ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d, terminate_big_destroy); if (ret) @@ -852,8 +857,7 @@ static void bis_cleanup(struct hci_conn *conn) hci_le_terminate_big(hdev, conn); } else { - hci_le_big_terminate(hdev, conn->iso_qos.bcast.big, - conn); + hci_le_big_terminate(hdev, conn); } } @@ -994,19 +998,20 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; break; case CIS_LINK: - case BIS_LINK: - case PA_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); - /* set proper cleanup function */ - if (!bacmp(dst, BDADDR_ANY)) - conn->cleanup = bis_cleanup; - else if (conn->role == HCI_ROLE_MASTER) + if (conn->role == HCI_ROLE_MASTER) conn->cleanup = cis_cleanup; - conn->mtu = hdev->iso_mtu ? hdev->iso_mtu : - hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; + conn->mtu = hdev->iso_mtu; + break; + case PA_LINK: + case BIS_LINK: + /* conn->src should reflect the local identity address */ + hci_copy_identity_address(hdev, &conn->src, &conn->src_type); + conn->cleanup = bis_cleanup; + conn->mtu = hdev->iso_mtu; break; case SCO_LINK: if (lmp_esco_capable(hdev)) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 3418d7b964a1..8ccec73dce45 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3832,13 +3832,14 @@ static void hci_tx_work(struct work_struct *work) static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_acl_hdr *hdr; - struct hci_conn *conn; __u16 handle, flags; + int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ACL packet too small"); - goto drop; + kfree_skb(skb); + return; } handle = __le16_to_cpu(hdr->handle); @@ -3850,36 +3851,27 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) hdev->stat.acl_rx++; - hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); - hci_dev_unlock(hdev); - - if (conn) { - hci_conn_enter_active_mode(conn, BT_POWER_FORCE_ACTIVE_OFF); - - /* Send to upper protocol */ - l2cap_recv_acldata(conn, skb, flags); - return; - } else { + err = l2cap_recv_acldata(hdev, handle, skb, flags); + if (err == -ENOENT) bt_dev_err(hdev, "ACL packet for unknown connection handle %d", handle); - } - -drop: - kfree_skb(skb); + else if (err) + bt_dev_dbg(hdev, "ACL packet recv for handle %d failed: %d", + handle, err); } /* SCO data packet */ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_sco_hdr *hdr; - struct hci_conn *conn; __u16 handle, flags; + int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "SCO packet too small"); - goto drop; + kfree_skb(skb); + return; } handle = __le16_to_cpu(hdr->handle); @@ -3891,34 +3883,28 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) hdev->stat.sco_rx++; - hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); - hci_dev_unlock(hdev); + hci_skb_pkt_status(skb) = flags & 0x03; - if (conn) { - /* Send to upper protocol */ - hci_skb_pkt_status(skb) = flags & 0x03; - sco_recv_scodata(conn, skb); - return; - } else { + err = sco_recv_scodata(hdev, handle, skb); + if (err == -ENOENT) bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d", handle); - } - -drop: - kfree_skb(skb); + else if (err) + bt_dev_dbg(hdev, "SCO packet recv for handle %d failed: %d", + handle, err); } static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_iso_hdr *hdr; - struct hci_conn *conn; __u16 handle, flags; + int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ISO packet too small"); - goto drop; + kfree_skb(skb); + return; } handle = __le16_to_cpu(hdr->handle); @@ -3928,22 +3914,13 @@ static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); - hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); - hci_dev_unlock(hdev); - - if (!conn) { + err = iso_recv(hdev, handle, skb, flags); + if (err == -ENOENT) bt_dev_err(hdev, "ISO packet for unknown connection handle %d", handle); - goto drop; - } - - /* Send to upper protocol */ - iso_recv(conn, skb, flags); - return; - -drop: - kfree_skb(skb); + else if (err) + bt_dev_dbg(hdev, "ISO packet recv for handle %d failed: %d", + handle, err); } static bool hci_req_is_complete(struct hci_dev *hdev) @@ -4121,7 +4098,7 @@ static void hci_rx_work(struct work_struct *work) } } -static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) +static int hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) { int err; @@ -4133,16 +4110,19 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) if (!hdev->sent_cmd) { skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); - return; + return -EINVAL; } if (hci_skb_opcode(skb) != HCI_OP_NOP) { err = hci_send_frame(hdev, skb); if (err < 0) { hci_cmd_sync_cancel_sync(hdev, -err); - return; + return err; } atomic_dec(&hdev->cmd_cnt); + } else { + err = -ENODATA; + kfree_skb(skb); } if (hdev->req_status == HCI_REQ_PEND && @@ -4150,12 +4130,15 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(hdev->req_skb); hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); } + + return err; } static void hci_cmd_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work); struct sk_buff *skb; + int err; BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name, atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q)); @@ -4166,7 +4149,9 @@ static void hci_cmd_work(struct work_struct *work) if (!skb) return; - hci_send_cmd_sync(hdev, skb); + err = hci_send_cmd_sync(hdev, skb); + if (err) + return; rcu_read_lock(); if (test_bit(HCI_RESET, &hdev->flags) || diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d37db364acf7..3838b90343d9 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4218,6 +4218,13 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data, } if (i == ARRAY_SIZE(hci_cc_table)) { + if (!skb->len) { + bt_dev_err(hdev, "Unexpected cc 0x%4.4x with no status", + *opcode); + *status = HCI_ERROR_UNSPECIFIED; + return; + } + /* Unknown opcode, assume byte 0 contains the status, so * that e.g. __hci_cmd_sync() properly returns errors * for vendor specific commands send by HCI drivers. @@ -5836,6 +5843,29 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, void *data, le16_to_cpu(ev->supervision_timeout)); } +static void hci_le_pa_sync_lost_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_ev_le_pa_sync_lost *ev = data; + u16 handle = le16_to_cpu(ev->handle); + struct hci_conn *conn; + + bt_dev_dbg(hdev, "sync handle 0x%4.4x", handle); + + hci_dev_lock(hdev); + + /* Delete the pa sync connection */ + conn = hci_conn_hash_lookup_pa_sync_handle(hdev, handle); + if (conn) { + clear_bit(HCI_CONN_BIG_SYNC, &conn->flags); + clear_bit(HCI_CONN_PA_SYNC, &conn->flags); + hci_disconn_cfm(conn, HCI_ERROR_REMOTE_USER_TERM); + hci_conn_del(conn); + } + + hci_dev_unlock(hdev); +} + static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -6994,14 +7024,9 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, continue; } - if (ev->status != 0x42) { + if (ev->status != 0x42) /* Mark PA sync as established */ set_bit(HCI_CONN_PA_SYNC, &bis->flags); - /* Reset cleanup callback of PA Sync so it doesn't - * terminate the sync when deleting the connection. - */ - conn->cleanup = NULL; - } bis->sync_handle = conn->sync_handle; bis->iso_qos.bcast.big = ev->handle; @@ -7044,29 +7069,24 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_evt_le_big_sync_lost *ev = data; - struct hci_conn *bis, *conn; - bool mgmt_conn; + struct hci_conn *bis; + bool mgmt_conn = false; bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle); hci_dev_lock(hdev); - /* Delete the pa sync connection */ - bis = hci_conn_hash_lookup_pa_sync_big_handle(hdev, ev->handle); - if (bis) { - conn = hci_conn_hash_lookup_pa_sync_handle(hdev, - bis->sync_handle); - if (conn) - hci_conn_del(conn); - } - /* Delete each bis connection */ while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle, BT_CONNECTED, HCI_ROLE_SLAVE))) { - mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &bis->flags); - mgmt_device_disconnected(hdev, &bis->dst, bis->type, bis->dst_type, - ev->reason, mgmt_conn); + if (!mgmt_conn) { + mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, + &bis->flags); + mgmt_device_disconnected(hdev, &bis->dst, bis->type, + bis->dst_type, ev->reason, + mgmt_conn); + } clear_bit(HCI_CONN_BIG_SYNC, &bis->flags); hci_disconn_cfm(bis, ev->reason); @@ -7180,6 +7200,9 @@ static const struct hci_le_ev { hci_le_per_adv_report_evt, sizeof(struct hci_ev_le_per_adv_report), HCI_MAX_EVENT_SIZE), + /* [0x10 = HCI_EV_LE_PA_SYNC_LOST] */ + HCI_LE_EV(HCI_EV_LE_PA_SYNC_LOST, hci_le_pa_sync_lost_evt, + sizeof(struct hci_ev_le_pa_sync_lost)), /* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */ HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt, sizeof(struct hci_evt_le_ext_adv_set_term)), diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index fc866759910d..ad19022ae127 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1311,7 +1311,9 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, goto done; } + hci_dev_lock(hdev); mgmt_index_removed(hdev); + hci_dev_unlock(hdev); err = hci_dev_open(hdev->id); if (err) { diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 73fc41b68b68..6e76798ec786 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6999,7 +6999,7 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err) hci_dev_lock(hdev); - if (!hci_conn_valid(hdev, conn)) + if (hci_conn_valid(hdev, conn)) clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); if (!err) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 3d98cb6291da..616c2fef91d2 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2314,14 +2314,31 @@ static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) iso_conn_del(hcon, bt_to_errno(reason)); } -void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags) { - struct iso_conn *conn = hcon->iso_data; + struct hci_conn *hcon; + struct iso_conn *conn; struct skb_shared_hwtstamps *hwts; __u16 pb, ts, len, sn; - if (!conn) - goto drop; + hci_dev_lock(hdev); + + hcon = hci_conn_hash_lookup_handle(hdev, handle); + if (!hcon) { + hci_dev_unlock(hdev); + kfree_skb(skb); + return -ENOENT; + } + + conn = iso_conn_hold_unless_zero(hcon->iso_data); + hcon = NULL; + + hci_dev_unlock(hdev); + + if (!conn) { + kfree_skb(skb); + return -EINVAL; + } pb = hci_iso_flags_pb(flags); ts = hci_iso_flags_ts(flags); @@ -2377,7 +2394,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) hci_skb_pkt_status(skb) = flags & 0x03; hci_skb_pkt_seqnum(skb) = sn; iso_recv_frame(conn, skb); - return; + goto done; } if (pb == ISO_SINGLE) { @@ -2455,6 +2472,9 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) drop: kfree_skb(skb); +done: + iso_conn_put(conn); + return 0; } static struct hci_cb iso_cb = { diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index d08320380ad6..07b493331fd7 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -497,6 +497,7 @@ void l2cap_chan_hold(struct l2cap_chan *c) kref_get(&c->kref); } +EXPORT_SYMBOL_GPL(l2cap_chan_hold); struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c) { @@ -7509,13 +7510,24 @@ struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) return c; } -void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb, u16 flags) { + struct hci_conn *hcon; struct l2cap_conn *conn; int len; - /* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */ - hci_dev_lock(hcon->hdev); + /* Lock hdev for hci_conn, and race on l2cap_data vs. l2cap_conn_del */ + hci_dev_lock(hdev); + + hcon = hci_conn_hash_lookup_handle(hdev, handle); + if (!hcon) { + hci_dev_unlock(hdev); + kfree_skb(skb); + return -ENOENT; + } + + hci_conn_enter_active_mode(hcon, BT_POWER_FORCE_ACTIVE_OFF); conn = hcon->l2cap_data; @@ -7523,12 +7535,13 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) conn = l2cap_conn_add(hcon); conn = l2cap_conn_hold_unless_zero(conn); + hcon = NULL; - hci_dev_unlock(hcon->hdev); + hci_dev_unlock(hdev); if (!conn) { kfree_skb(skb); - return; + return -EINVAL; } BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags); @@ -7642,6 +7655,7 @@ drop: unlock: mutex_unlock(&conn->lock); l2cap_conn_put(conn); + return 0; } static struct hci_cb l2cap_cb = { diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 24e335e3a727..262bf984d2aa 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -5395,9 +5395,9 @@ static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, for (i = 0; i < pattern_count; i++) { offset = patterns[i].offset; length = patterns[i].length; - if (offset >= HCI_MAX_EXT_AD_LENGTH || - length > HCI_MAX_EXT_AD_LENGTH || - (offset + length) > HCI_MAX_EXT_AD_LENGTH) + if (offset >= HCI_MAX_AD_LENGTH || + length > HCI_MAX_AD_LENGTH || + (offset + length) > HCI_MAX_AD_LENGTH) return MGMT_STATUS_INVALID_PARAMS; p = kmalloc(sizeof(*p), GFP_KERNEL); @@ -9497,6 +9497,7 @@ void mgmt_index_removed(struct hci_dev *hdev) cancel_delayed_work_sync(&hdev->discov_off); cancel_delayed_work_sync(&hdev->service_cache); cancel_delayed_work_sync(&hdev->rpa_expired); + cancel_delayed_work_sync(&hdev->mesh_send_done); } void mgmt_power_on(struct hci_dev *hdev, int err) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index ab0cf442d57b..298c2a9ab4df 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1458,22 +1458,39 @@ static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) sco_conn_del(hcon, bt_to_errno(reason)); } -void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb) { - struct sco_conn *conn = hcon->sco_data; + struct hci_conn *hcon; + struct sco_conn *conn; - if (!conn) - goto drop; + hci_dev_lock(hdev); + + hcon = hci_conn_hash_lookup_handle(hdev, handle); + if (!hcon) { + hci_dev_unlock(hdev); + kfree_skb(skb); + return -ENOENT; + } + + conn = sco_conn_hold_unless_zero(hcon->sco_data); + hcon = NULL; + + hci_dev_unlock(hdev); + + if (!conn) { + kfree_skb(skb); + return -EINVAL; + } BT_DBG("conn %p len %u", conn, skb->len); - if (skb->len) { + if (skb->len) sco_recv_frame(conn, skb); - return; - } + else + kfree_skb(skb); -drop: - kfree_skb(skb); + sco_conn_put(conn); + return 0; } static struct hci_cb sco_cb = { diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 45512b2ba951..3a1ce04a7a53 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2136,7 +2136,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; u8 *pkax, *pkbx, *na, *nb, confirm_hint; - u32 passkey; + u32 passkey = 0; int err; bt_dev_dbg(hcon->hdev, "conn %p", conn); @@ -2188,24 +2188,6 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); - - /* Only Just-Works pairing requires extra checks */ - if (smp->method != JUST_WORKS) - goto mackey_and_ltk; - - /* If there already exists long term key in local host, leave - * the decision to user space since the remote device could - * be legitimate or malicious. - */ - if (hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, - hcon->role)) { - /* Set passkey to 0. The value can be any number since - * it'll be ignored anyway. - */ - passkey = 0; - confirm_hint = 1; - goto confirm; - } } mackey_and_ltk: @@ -2226,11 +2208,12 @@ mackey_and_ltk: if (err) return SMP_UNSPECIFIED; - confirm_hint = 0; - -confirm: - if (smp->method == JUST_WORKS) - confirm_hint = 1; + /* Always require user confirmation for Just-Works pairing to prevent + * impersonation attacks, or in case of a legitimate device that is + * repairing use the confirmation as acknowledgment to proceed with the + * creation of new keys. + */ + confirm_hint = smp->method == JUST_WORKS ? 1 : 0; err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, confirm_hint); diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 870bdf2e082c..dea09096ad0f 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -25,7 +25,7 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - (br_mst_is_enabled(p->br) || p->state == BR_STATE_FORWARDING) && + (br_mst_is_enabled(p) || p->state == BR_STATE_FORWARDING) && br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) && !br_skb_isolated(p, skb); } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 98c5b9c3145f..ca3a637d7cca 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -386,6 +386,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) del_nbp(p); } + br_mst_uninit(br); br_recalculate_neigh_suppress_enabled(br); br_fdb_delete_by_port(br, NULL, 0, 1); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 67b4c905e49a..777fa869c1a1 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -94,7 +94,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb br = p->br; - if (br_mst_is_enabled(br)) { + if (br_mst_is_enabled(p)) { state = BR_STATE_FORWARDING; } else { if (p->state == BR_STATE_DISABLED) { @@ -429,7 +429,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; forward: - if (br_mst_is_enabled(p->br)) + if (br_mst_is_enabled(p)) goto defer_stp_filtering; switch (p->state) { diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 3f24b4ee49c2..43a300ae6bfa 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -22,6 +22,12 @@ bool br_mst_enabled(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_mst_enabled); +void br_mst_uninit(struct net_bridge *br) +{ + if (br_opt_get(br, BROPT_MST_ENABLED)) + static_branch_dec(&br_mst_used); +} + int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids) { const struct net_bridge_vlan_group *vg; @@ -225,9 +231,9 @@ int br_mst_set_enabled(struct net_bridge *br, bool on, return err; if (on) - static_branch_enable(&br_mst_used); + static_branch_inc(&br_mst_used); else - static_branch_disable(&br_mst_used); + static_branch_dec(&br_mst_used); br_opt_toggle(br, BROPT_MST_ENABLED, on); return 0; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 16be5d250402..7280c4e9305f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1935,10 +1935,12 @@ static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) /* br_mst.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING DECLARE_STATIC_KEY_FALSE(br_mst_used); -static inline bool br_mst_is_enabled(struct net_bridge *br) +static inline bool br_mst_is_enabled(const struct net_bridge_port *p) { + /* check the port's vlan group to avoid racing with port deletion */ return static_branch_unlikely(&br_mst_used) && - br_opt_get(br, BROPT_MST_ENABLED); + br_opt_get(p->br, BROPT_MST_ENABLED) && + rcu_access_pointer(p->vlgrp); } int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, @@ -1952,8 +1954,9 @@ int br_mst_fill_info(struct sk_buff *skb, const struct net_bridge_vlan_group *vg); int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, struct netlink_ext_ack *extack); +void br_mst_uninit(struct net_bridge *br); #else -static inline bool br_mst_is_enabled(struct net_bridge *br) +static inline bool br_mst_is_enabled(const struct net_bridge_port *p) { return false; } @@ -1987,6 +1990,10 @@ static inline int br_mst_process(struct net_bridge_port *p, { return -EOPNOTSUPP; } + +static inline void br_mst_uninit(struct net_bridge *br) +{ +} #endif struct nf_br_ops { diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index b71b1635916e..a21c157daf7d 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -631,6 +631,7 @@ static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id, /* connection secret */ ceph_decode_32_safe(p, end, len, e_inval); + ceph_decode_need(p, end, len, e_inval); dout("%s connection secret blob len %d\n", __func__, len); if (len > 0) { dp = *p + ceph_x_encrypt_offset(); @@ -648,6 +649,7 @@ static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id, /* service tickets */ ceph_decode_32_safe(p, end, len, e_inval); + ceph_decode_need(p, end, len, e_inval); dout("%s service tickets blob len %d\n", __func__, len); if (len > 0) { ret = ceph_x_proc_ticket_reply(ac, &th->session_key, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4c6441536d55..e734e57be083 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -786,41 +786,52 @@ void ceph_reset_client_addr(struct ceph_client *client) EXPORT_SYMBOL(ceph_reset_client_addr); /* - * true if we have the mon map (and have thus joined the cluster) - */ -static bool have_mon_and_osd_map(struct ceph_client *client) -{ - return client->monc.monmap && client->monc.monmap->epoch && - client->osdc.osdmap && client->osdc.osdmap->epoch; -} - -/* * mount: join the ceph cluster, and open root directory. */ -int __ceph_open_session(struct ceph_client *client, unsigned long started) +int __ceph_open_session(struct ceph_client *client) { - unsigned long timeout = client->options->mount_timeout; - long err; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + long timeout = ceph_timeout_jiffies(client->options->mount_timeout); + bool have_monmap, have_osdmap; + int err; /* open session, and wait for mon and osd maps */ err = ceph_monc_open_session(&client->monc); if (err < 0) return err; - while (!have_mon_and_osd_map(client)) { - if (timeout && time_after_eq(jiffies, started + timeout)) - return -ETIMEDOUT; + add_wait_queue(&client->auth_wq, &wait); + for (;;) { + mutex_lock(&client->monc.mutex); + err = client->auth_err; + have_monmap = client->monc.monmap && client->monc.monmap->epoch; + mutex_unlock(&client->monc.mutex); + + down_read(&client->osdc.lock); + have_osdmap = client->osdc.osdmap && client->osdc.osdmap->epoch; + up_read(&client->osdc.lock); + + if (err || (have_monmap && have_osdmap)) + break; + + if (signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + if (!timeout) { + err = -ETIMEDOUT; + break; + } /* wait */ dout("mount waiting for mon_map\n"); - err = wait_event_interruptible_timeout(client->auth_wq, - have_mon_and_osd_map(client) || (client->auth_err < 0), - ceph_timeout_jiffies(timeout)); - if (err < 0) - return err; - if (client->auth_err < 0) - return client->auth_err; + timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); } + remove_wait_queue(&client->auth_wq, &wait); + + if (err) + return err; pr_info("client%llu fsid %pU\n", ceph_client_gid(client), &client->fsid); @@ -833,12 +844,11 @@ EXPORT_SYMBOL(__ceph_open_session); int ceph_open_session(struct ceph_client *client) { int ret; - unsigned long started = jiffies; /* note the start time */ dout("open_session start\n"); mutex_lock(&client->mount_mutex); - ret = __ceph_open_session(client, started); + ret = __ceph_open_session(client); mutex_unlock(&client->mount_mutex); return ret; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 2110439f8a24..83c270bce63c 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -36,8 +36,9 @@ static int monmap_show(struct seq_file *s, void *p) int i; struct ceph_client *client = s->private; + mutex_lock(&client->monc.mutex); if (client->monc.monmap == NULL) - return 0; + goto out_unlock; seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); for (i = 0; i < client->monc.monmap->num_mon; i++) { @@ -48,6 +49,9 @@ static int monmap_show(struct seq_file *s, void *p) ENTITY_NAME(inst->name), ceph_pr_addr(&inst->addr)); } + +out_unlock: + mutex_unlock(&client->monc.mutex); return 0; } @@ -56,13 +60,14 @@ static int osdmap_show(struct seq_file *s, void *p) int i; struct ceph_client *client = s->private; struct ceph_osd_client *osdc = &client->osdc; - struct ceph_osdmap *map = osdc->osdmap; + struct ceph_osdmap *map; struct rb_node *n; + down_read(&osdc->lock); + map = osdc->osdmap; if (map == NULL) - return 0; + goto out_unlock; - down_read(&osdc->lock); seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch, osdc->epoch_barrier, map->flags); @@ -131,6 +136,7 @@ static int osdmap_show(struct seq_file *s, void *p) seq_printf(s, "]\n"); } +out_unlock: up_read(&osdc->lock); return 0; } diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 9e39378eda00..9e48623018a3 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1061,13 +1061,16 @@ static int decrypt_control_remainder(struct ceph_connection *con) static int process_v2_sparse_read(struct ceph_connection *con, struct page **pages, int spos) { - struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor; + struct ceph_msg_data_cursor cursor; int ret; + ceph_msg_data_cursor_init(&cursor, con->in_msg, + con->in_msg->sparse_read_total); + for (;;) { char *buf = NULL; - ret = con->ops->sparse_read(con, cursor, &buf); + ret = con->ops->sparse_read(con, &cursor, &buf); if (ret <= 0) return ret; @@ -1085,11 +1088,11 @@ static int process_v2_sparse_read(struct ceph_connection *con, } else { struct bio_vec bv; - get_bvec_at(cursor, &bv); + get_bvec_at(&cursor, &bv); len = min_t(int, len, bv.bv_len); memcpy_page(bv.bv_page, bv.bv_offset, spage, soff, len); - ceph_msg_data_advance(cursor, len); + ceph_msg_data_advance(&cursor, len); } spos += len; ret -= len; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 295098873861..d245fa508e1c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1504,8 +1504,6 @@ static int decode_new_primary_temp(void **p, void *end, u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) { - BUG_ON(osd >= map->max_osd); - if (!map->osd_primary_affinity) return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; @@ -1514,8 +1512,6 @@ u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) { - BUG_ON(osd >= map->max_osd); - if (!map->osd_primary_affinity) { int i; @@ -1577,6 +1573,8 @@ static int decode_new_primary_affinity(void **p, void *end, ceph_decode_32_safe(p, end, osd, e_inval); ceph_decode_32_safe(p, end, aff, e_inval); + if (osd >= map->max_osd) + goto e_inval; ret = set_primary_affinity(map, osd, aff); if (ret) @@ -1879,7 +1877,9 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, ceph_decode_need(p, end, 2*sizeof(u32), e_inval); osd = ceph_decode_32(p); w = ceph_decode_32(p); - BUG_ON(osd >= map->max_osd); + if (osd >= map->max_osd) + goto e_inval; + osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w, w == CEPH_OSD_IN ? "(in)" : (w == CEPH_OSD_OUT ? "(out)" : "")); @@ -1905,13 +1905,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, u32 xorstate; osd = ceph_decode_32(p); + if (osd >= map->max_osd) + goto e_inval; + if (struct_v >= 5) xorstate = ceph_decode_32(p); else xorstate = ceph_decode_8(p); if (xorstate == 0) xorstate = CEPH_OSD_UP; - BUG_ON(osd >= map->max_osd); if ((map->osd_state[osd] & CEPH_OSD_UP) && (xorstate & CEPH_OSD_UP)) osdmap_info(map, "osd%d down\n", osd); @@ -1937,7 +1939,9 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_entity_addr addr; osd = ceph_decode_32(p); - BUG_ON(osd >= map->max_osd); + if (osd >= map->max_osd) + goto e_inval; + if (struct_v >= 7) ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); else diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index ad54b12d4b4c..8bb71a10dba0 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -443,6 +443,9 @@ static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, struct ifreq ifrr; int err; + if (!kernel_cfg->ifr) + return -EINVAL; + strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index fd57b845de33..a725d21159a6 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -60,9 +60,10 @@ static int gro_cell_poll(struct napi_struct *napi, int budget) struct sk_buff *skb; int work_done = 0; - __local_lock_nested_bh(&cell->bh_lock); while (work_done < budget) { + __local_lock_nested_bh(&cell->bh_lock); skb = __skb_dequeue(&cell->napi_skbs); + __local_unlock_nested_bh(&cell->bh_lock); if (!skb) break; napi_gro_receive(napi, skb); @@ -71,7 +72,6 @@ static int gro_cell_poll(struct napi_struct *napi, int budget) if (work_done < budget) napi_complete_done(napi, work_done); - __local_unlock_nested_bh(&cell->bh_lock); return work_done; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b0e0f22d7b21..83cbec4afcb3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -439,7 +439,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); int error = 0; - net->net_cookie = ns_tree_gen_id(&net->ns); + net->net_cookie = ns_tree_gen_id(net); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 60a05d3b7c24..331764845e8f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -228,19 +228,16 @@ static void refill_skbs(struct netpoll *np) { struct sk_buff_head *skb_pool; struct sk_buff *skb; - unsigned long flags; skb_pool = &np->skb_pool; - spin_lock_irqsave(&skb_pool->lock, flags); - while (skb_pool->qlen < MAX_SKBS) { + while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; - __skb_queue_tail(skb_pool, skb); + skb_queue_tail(skb_pool, skb); } - spin_unlock_irqrestore(&skb_pool->lock, flags); } static void zap_completion_queue(void) @@ -814,6 +811,10 @@ static void __netpoll_cleanup(struct netpoll *np) if (!npinfo) return; + /* At this point, there is a single npinfo instance per netdevice, and + * its refcnt tracks how many netpoll structures are linked to it. We + * only perform npinfo cleanup when the refcnt decrements to zero. + */ if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; @@ -823,8 +824,7 @@ static void __netpoll_cleanup(struct netpoll *np) RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); - } else - RCU_INIT_POINTER(np->dev->npinfo, NULL); + } skb_pool_flush(np); } diff --git a/net/core/scm.c b/net/core/scm.c index 66eaee783e8b..cd87f66671aa 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -273,17 +273,13 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) check_object_size(data, cmlen - sizeof(*cm), true); - if (can_do_masked_user_access()) - cm = masked_user_access_begin(cm); - else if (!user_write_access_begin(cm, cmlen)) - goto efault; - - unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); - unsafe_put_user(level, &cm->cmsg_level, efault_end); - unsafe_put_user(type, &cm->cmsg_type, efault_end); - unsafe_copy_to_user(CMSG_USER_DATA(cm), data, - cmlen - sizeof(*cm), efault_end); - user_write_access_end(); + scoped_user_write_access_size(cm, cmlen, efault) { + unsafe_put_user(cmlen, &cm->cmsg_len, efault); + unsafe_put_user(level, &cm->cmsg_level, efault); + unsafe_put_user(type, &cm->cmsg_type, efault); + unsafe_copy_to_user(CMSG_USER_DATA(cm), data, + cmlen - sizeof(*cm), efault); + } } else { struct cmsghdr *cm = msg->msg_control; @@ -301,8 +297,6 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_controllen -= cmlen; return 0; -efault_end: - user_write_access_end(); efault: return -EFAULT; } diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 264fb82cba19..d157a8419bca 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -828,13 +828,15 @@ void devl_rate_nodes_destroy(struct devlink *devlink) if (!devlink_rate->parent) continue; - refcount_dec(&devlink_rate->parent->refcnt); if (devlink_rate_is_leaf(devlink_rate)) ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); else if (devlink_rate_is_node(devlink_rate)) ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); + + refcount_dec(&devlink_rate->parent->refcnt); + devlink_rate->parent = NULL; } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 82b084cc1cc6..53da62984447 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -78,7 +78,6 @@ int dns_query(struct net *net, { struct key *rkey; struct user_key_payload *upayload; - const struct cred *saved_cred; size_t typelen, desclen; char *desc, *cp; int ret, len; @@ -124,9 +123,8 @@ int dns_query(struct net *net, /* make the upcall, using special credentials to prevent the use of * add_key() to preinstall malicious redirections */ - saved_cred = override_creds(dns_resolver_cache); - rkey = request_key_net(&key_type_dns_resolver, desc, net, options); - revert_creds(saved_cred); + scoped_with_creds(dns_resolver_cache) + rkey = request_key_net(&key_type_dns_resolver, desc, net, options); kfree(desc); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 26bb657ceac3..eadb358179ce 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -176,7 +176,8 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); - dsa_default_offload_fwd_mark(skb); + if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest))) + dsa_default_offload_fwd_mark(skb); return skb; } @@ -224,12 +225,14 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, { int len = BRCM_LEG_TAG_LEN; int source_port; + __be16 *proto; u8 *brcm_tag; if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN))) return NULL; brcm_tag = dsa_etype_header_pos_rx(skb); + proto = (__be16 *)(brcm_tag + BRCM_LEG_TAG_LEN); source_port = brcm_tag[5] & BRCM_LEG_PORT_ID; @@ -237,14 +240,19 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, if (!skb->dev) return NULL; - /* VLAN tag is added by BCM63xx internal switch */ - if (netdev_uses_dsa(skb->dev)) + /* The internal switch in BCM63XX SoCs always tags on egress on the CPU + * port. We use VID 0 internally for untagged traffic, so strip the tag + * if the TCI field is all 0, and keep it otherwise to also retain + * e.g. 802.1p tagged packets. + */ + if (proto[0] == htons(ETH_P_8021Q) && proto[1] == 0) len += VLAN_HLEN; /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, len); - dsa_default_offload_fwd_mark(skb); + if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest))) + dsa_default_offload_fwd_mark(skb); dsa_strip_etype_header(skb, len); diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 7e46d130dce2..1d33a4675a48 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -93,7 +93,7 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) struct handshake_net *hn = handshake_pernet(net); struct handshake_req *req = NULL; struct socket *sock; - int class, fd, err; + int class, err; err = -EOPNOTSUPP; if (!hn) @@ -106,27 +106,25 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) err = -EAGAIN; req = handshake_req_next(hn, class); - if (!req) - goto out_status; - - sock = req->hr_sk->sk_socket; - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - err = fd; - goto out_complete; - } - - err = req->hr_proto->hp_accept(req, info, fd); - if (err) { - put_unused_fd(fd); - goto out_complete; + if (req) { + sock = req->hr_sk->sk_socket; + + FD_PREPARE(fdf, O_CLOEXEC, sock->file); + if (fdf.err) { + err = fdf.err; + goto out_complete; + } + + get_file(sock->file); /* FD_PREPARE() consumes a reference. */ + err = req->hr_proto->hp_accept(req, info, fd_prepare_fd(fdf)); + if (err) + goto out_complete; /* Automatic cleanup handles fput */ + + trace_handshake_cmd_accept(net, req, req->hr_sk, fd_prepare_fd(fdf)); + fd_publish(fdf); + return 0; } - fd_install(fd, get_file(sock->file)); - - trace_handshake_cmd_accept(net, req, req->hr_sk, fd); - return 0; - out_complete: handshake_complete(req, -EIO, NULL); out_status: diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index 081093dfd553..8f9532a15f43 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -259,6 +259,7 @@ static int tls_handshake_accept(struct handshake_req *req, out_cancel: genlmsg_cancel(msg, hdr); + nlmsg_free(msg); out: return ret; } diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index fbbc3ccf9df6..492cbc78ab75 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -320,6 +320,9 @@ static void send_hsr_supervision_frame(struct hsr_port *port, } hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); + skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); + skb_reset_mac_len(skb); + set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); @@ -334,7 +337,7 @@ static void send_hsr_supervision_frame(struct hsr_port *port, } hsr_stag->tlv.HSR_TLV_type = type; - /* TODO: Why 12 in HSRv0? */ + /* HSRv0 has 6 unused bytes after the MAC */ hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ? sizeof(struct hsr_sup_payload) : 12; diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index c67c0d35921d..339f0d220212 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -262,15 +262,23 @@ static struct sk_buff *prp_fill_rct(struct sk_buff *skb, return skb; } -static void hsr_set_path_id(struct hsr_ethhdr *hsr_ethhdr, +static void hsr_set_path_id(struct hsr_frame_info *frame, + struct hsr_ethhdr *hsr_ethhdr, struct hsr_port *port) { int path_id; - if (port->type == HSR_PT_SLAVE_A) - path_id = 0; - else - path_id = 1; + if (port->hsr->prot_version) { + if (port->type == HSR_PT_SLAVE_A) + path_id = 0; + else + path_id = 1; + } else { + if (frame->is_supervision) + path_id = 0xf; + else + path_id = 1; + } set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id); } @@ -304,7 +312,7 @@ static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, else hsr_ethhdr = (struct hsr_ethhdr *)pc; - hsr_set_path_id(hsr_ethhdr, port); + hsr_set_path_id(frame, hsr_ethhdr, port); set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; @@ -330,7 +338,7 @@ struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, (struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr); /* set the lane id properly */ - hsr_set_path_id(hsr_ethhdr, port); + hsr_set_path_id(frame, hsr_ethhdr, port); return skb_clone(frame->skb_hsr, GFP_ATOMIC); } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { return skb_clone(frame->skb_std, GFP_ATOMIC); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index e0d94270da28..05828d4cb6cd 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -122,8 +122,10 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __be16 type = x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6) - : htons(ETH_P_IP); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, + XFRM_MODE_SKB_CB(skb)->protocol); + __be16 type = inner_mode->family == AF_INET6 ? htons(ETH_P_IPV6) + : htons(ETH_P_IP); return skb_eth_gso_segment(skb, features, type); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6d27d3610c1c..b549d6a57307 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -607,6 +607,11 @@ static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) oldest_p = fnhe_p; } } + + /* Clear oldest->fnhe_daddr to prevent this fnhe from being + * rebound with new dsts in rt_bind_exception(). + */ + oldest->fnhe_daddr = 0; fnhe_flush_routes(oldest); *oldest_p = oldest->fnhe_next; kfree_rcu(oldest, rcu); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 7b41fb4f00b5..22410243ebe8 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -158,8 +158,10 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP) - : htons(ETH_P_IPV6); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, + XFRM_MODE_SKB_CB(skb)->protocol); + __be16 type = inner_mode->family == AF_INET ? htons(ETH_P_IP) + : htons(ETH_P_IPV6); return skb_eth_gso_segment(skb, features, type); } diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index b4f01cb07561..5dd7e0509a48 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1560,24 +1560,16 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCKCMCLONE: { struct kcm_clone info; - struct file *file; - info.fd = get_unused_fd_flags(0); - if (unlikely(info.fd < 0)) - return info.fd; + FD_PREPARE(fdf, 0, kcm_clone(sock)); + if (fdf.err) + return fdf.err; - file = kcm_clone(sock); - if (IS_ERR(file)) { - put_unused_fd(info.fd); - return PTR_ERR(file); - } - if (copy_to_user((void __user *)arg, &info, - sizeof(info))) { - put_unused_fd(info.fd); - fput(file); + info.fd = fd_prepare_fd(fdf); + if (copy_to_user((void __user *)arg, &info, sizeof(info))) return -EFAULT; - } - fd_install(info.fd, file); + + fd_publish(fdf); err = 0; break; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 369a2f2e459c..0710281dd95a 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1246,9 +1246,9 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns else l2tp_build_l2tpv3_header(session, __skb_push(skb, session->hdr_len)); - /* Reset skb netfilter state */ - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); + /* Reset control buffer */ + memset(skb->cb, 0, sizeof(skb->cb)); + nf_reset_ct(skb); /* L2TP uses its own lockdep subclass to avoid lockdep splats caused by diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 57065714cf8c..7f8799fd673e 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1290,7 +1290,7 @@ ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link) &link->csa.finalize_work); break; case NL80211_IFTYPE_STATION: - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work, 0); break; case NL80211_IFTYPE_UNSPECIFIED: diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 73fd86ec1bce..878c3b14aeb8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -612,11 +612,11 @@ struct ieee80211_if_managed { u8 *assoc_req_ies; size_t assoc_req_ies_len; - struct wiphy_delayed_work ml_reconf_work; + struct wiphy_hrtimer_work ml_reconf_work; u16 removed_links; /* TID-to-link mapping support */ - struct wiphy_delayed_work ttlm_work; + struct wiphy_hrtimer_work ttlm_work; struct ieee80211_adv_ttlm_info ttlm_info; struct wiphy_work teardown_ttlm_work; @@ -1017,10 +1017,10 @@ struct ieee80211_link_data_managed { bool operating_11g_mode; struct { - struct wiphy_delayed_work switch_work; + struct wiphy_hrtimer_work switch_work; struct cfg80211_chan_def ap_chandef; struct ieee80211_parsed_tpe tpe; - unsigned long time; + ktime_t time; bool waiting_bcn; bool ignored_same_chan; bool blocked_tx; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index a7873832d4fa..0ca55b9655a7 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -223,6 +223,10 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata if (netif_carrier_ok(sdata->dev)) return -EBUSY; + /* if any stations are set known (so they know this vif too), reject */ + if (sta_info_get_by_idx(sdata, 0)) + return -EBUSY; + /* First check no ROC work is happening on this iface */ list_for_each_entry(roc, &local->roc_list, list) { if (roc->sdata != sdata) @@ -242,12 +246,16 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata ret = -EBUSY; } + /* + * More interface types could be added here but changing the + * address while powered makes the most sense in client modes. + */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - /* More interface types could be added here but changing the - * address while powered makes the most sense in client modes. - */ + /* refuse while connecting */ + if (sdata->u.mgd.auth_data || sdata->u.mgd.assoc_data) + return -EBUSY; break; default: ret = -EOPNOTSUPP; diff --git a/net/mac80211/link.c b/net/mac80211/link.c index d71eabe5abf8..4a19b765ccb6 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -472,10 +472,10 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, * from there. */ if (link->conf->csa_active) - wiphy_delayed_work_queue(local->hw.wiphy, + wiphy_hrtimer_work_queue(local->hw.wiphy, &link->u.mgd.csa.switch_work, link->u.mgd.csa.time - - jiffies); + ktime_get_boottime()); } for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3b5827ea438e..f3138d158535 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -45,7 +45,7 @@ #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_ASSOC_MAX_TRIES 3 -#define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS msecs_to_jiffies(100) +#define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS (100 * USEC_PER_MSEC) #define IEEE80211_ADV_TTLM_ST_UNDERFLOW 0xff00 #define IEEE80211_NEG_TTLM_REQ_TIMEOUT (HZ / 5) @@ -2594,7 +2594,7 @@ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, return; } - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work, 0); } @@ -2753,7 +2753,8 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, .timestamp = timestamp, .device_timestamp = device_timestamp, }; - unsigned long now; + u32 csa_time_tu; + ktime_t now; int res; lockdep_assert_wiphy(local->hw.wiphy); @@ -2983,10 +2984,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, csa_ie.mode); /* we may have to handle timeout for deactivated link in software */ - now = jiffies; - link->u.mgd.csa.time = now + - TU_TO_JIFFIES((max_t(int, csa_ie.count, 1) - 1) * - link->conf->beacon_int); + now = ktime_get_boottime(); + csa_time_tu = (max_t(int, csa_ie.count, 1) - 1) * link->conf->beacon_int; + link->u.mgd.csa.time = now + us_to_ktime(ieee80211_tu_to_usec(csa_time_tu)); if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && local->ops->channel_switch) { @@ -3001,7 +3001,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, } /* channel switch handled in software */ - wiphy_delayed_work_queue(local->hw.wiphy, + wiphy_hrtimer_work_queue(local->hw.wiphy, &link->u.mgd.csa.switch_work, link->u.mgd.csa.time - now); return; @@ -4242,14 +4242,14 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, memset(&sdata->u.mgd.ttlm_info, 0, sizeof(sdata->u.mgd.ttlm_info)); - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->neg_ttlm_timeout_work); sdata->u.mgd.removed_links = 0; - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); wiphy_work_cancel(sdata->local->hw.wiphy, @@ -6876,7 +6876,7 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, /* In case the removal was cancelled, abort it */ if (sdata->u.mgd.removed_links) { sdata->u.mgd.removed_links = 0; - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); } return; @@ -6906,9 +6906,9 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, } sdata->u.mgd.removed_links = removed_links; - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work, - TU_TO_JIFFIES(delay)); + us_to_ktime(ieee80211_tu_to_usec(delay))); } static int ieee80211_ttlm_set_links(struct ieee80211_sub_if_data *sdata, @@ -7095,7 +7095,7 @@ static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, /* if a planned TID-to-link mapping was cancelled - * abort it */ - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); } else if (sdata->u.mgd.ttlm_info.active) { /* if no TID-to-link element, set to default mapping in @@ -7130,7 +7130,7 @@ static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, if (ttlm_info.switch_time) { u16 beacon_ts_tu, st_tu, delay; - u32 delay_jiffies; + u64 delay_usec; u64 mask; /* The t2l map switch time is indicated with a partial @@ -7152,23 +7152,23 @@ static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, if (delay > IEEE80211_ADV_TTLM_ST_UNDERFLOW) return; - delay_jiffies = TU_TO_JIFFIES(delay); + delay_usec = ieee80211_tu_to_usec(delay); /* Link switching can take time, so schedule it * 100ms before to be ready on time */ - if (delay_jiffies > IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS) - delay_jiffies -= + if (delay_usec > IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS) + delay_usec -= IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS; else - delay_jiffies = 0; + delay_usec = 0; sdata->u.mgd.ttlm_info = ttlm_info; - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work, - delay_jiffies); + us_to_ktime(delay_usec)); return; } } @@ -8793,7 +8793,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ieee80211_csa_connection_drop_work); wiphy_delayed_work_init(&ifmgd->tdls_peer_del_work, ieee80211_tdls_peer_del_work); - wiphy_delayed_work_init(&ifmgd->ml_reconf_work, + wiphy_hrtimer_work_init(&ifmgd->ml_reconf_work, ieee80211_ml_reconf_work); wiphy_delayed_work_init(&ifmgd->reconf.wk, ieee80211_ml_sta_reconf_timeout); @@ -8802,7 +8802,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) timer_setup(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, 0); wiphy_delayed_work_init(&ifmgd->tx_tspec_wk, ieee80211_sta_handle_tspec_ac_params_wk); - wiphy_delayed_work_init(&ifmgd->ttlm_work, + wiphy_hrtimer_work_init(&ifmgd->ttlm_work, ieee80211_tid_to_link_map_work); wiphy_delayed_work_init(&ifmgd->neg_ttlm_timeout_work, ieee80211_neg_ttlm_timeout_work); @@ -8849,7 +8849,7 @@ void ieee80211_mgd_setup_link(struct ieee80211_link_data *link) else link->u.mgd.req_smps = IEEE80211_SMPS_OFF; - wiphy_delayed_work_init(&link->u.mgd.csa.switch_work, + wiphy_hrtimer_work_init(&link->u.mgd.csa.switch_work, ieee80211_csa_switch_work); ieee80211_clear_tpe(&link->conf->tpe); @@ -10064,7 +10064,7 @@ void ieee80211_mgd_stop_link(struct ieee80211_link_data *link) &link->u.mgd.request_smps_work); wiphy_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); - wiphy_delayed_work_cancel(link->sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work); } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6af43dfefdd6..5b4c3fe9970a 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -5360,10 +5360,14 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, if (WARN_ON(!local->started)) goto drop; - if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) { + if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC) && + !(status->flag & RX_FLAG_NO_PSDU && + status->zero_length_psdu_type == + IEEE80211_RADIOTAP_ZERO_LEN_PSDU_NOT_CAPTURED))) { /* - * Validate the rate, unless a PLCP error means that - * we probably can't have a valid rate here anyway. + * Validate the rate, unless there was a PLCP error which may + * have an invalid rate or the PSDU was not capture and may be + * missing rate information. */ switch (status->encoding) { diff --git a/net/mctp/route.c b/net/mctp/route.c index 4d314e062ba9..2ac4011a953f 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -623,6 +623,7 @@ static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) skb->protocol = htons(ETH_P_MCTP); skb->pkt_type = PACKET_OUTGOING; + skb->dev = dst->dev->dev; if (skb->len > dst->mtu) { kfree_skb(skb); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1103b3341a70..f24ae7d40e88 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -838,8 +838,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, opts->suboptions = 0; + /* Force later mptcp_write_options(), but do not use any actual + * option space. + */ if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) - return false; + return true; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) || @@ -1041,6 +1044,31 @@ static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) WRITE_ONCE(msk->snd_una, new_snd_una); } +static void rwin_update(struct mptcp_sock *msk, struct sock *ssk, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct tcp_sock *tp = tcp_sk(ssk); + u64 mptcp_rcv_wnd; + + /* Avoid touching extra cachelines if TCP is going to accept this + * skb without filling the TCP-level window even with a possibly + * outdated mptcp-level rwin. + */ + if (!skb->len || skb->len < tcp_receive_window(tp)) + return; + + mptcp_rcv_wnd = atomic64_read(&msk->rcv_wnd_sent); + if (!after64(mptcp_rcv_wnd, subflow->rcv_wnd_sent)) + return; + + /* Some other subflow grew the mptcp-level rwin since rcv_wup, + * resync. + */ + tp->rcv_wnd += mptcp_rcv_wnd - subflow->rcv_wnd_sent; + subflow->rcv_wnd_sent = mptcp_rcv_wnd; +} + static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) @@ -1208,6 +1236,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) */ if (mp_opt.use_ack) ack_update_msk(msk, sk, &mp_opt); + rwin_update(msk, sk, skb); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not @@ -1294,6 +1323,10 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) if (rcv_wnd_new != rcv_wnd_old) { raise_win: + /* The msk-level rcv wnd is after the tcp level one, + * sync the latter. + */ + rcv_wnd_new = rcv_wnd_old; win = rcv_wnd_old - ack_seq; tp->rcv_wnd = min_t(u64, win, U32_MAX); new_win = tp->rcv_wnd; @@ -1317,6 +1350,21 @@ raise_win: update_wspace: WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); + subflow->rcv_wnd_sent = rcv_wnd_new; +} + +static void mptcp_track_rwin(struct tcp_sock *tp) +{ + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + + if (!ssk) + return; + + subflow = mptcp_subflow_ctx(ssk); + msk = mptcp_sk(subflow->conn); + WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) @@ -1611,6 +1659,10 @@ mp_rst: opts->reset_transient, opts->reset_reason); return; + } else if (unlikely(!opts->suboptions)) { + /* Fallback to TCP */ + mptcp_track_rwin(tp); + return; } if (OPTION_MPTCP_PRIO & opts->suboptions) { diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 2ff1b9499568..9604b91902b8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -18,6 +18,7 @@ struct mptcp_pm_add_entry { u8 retrans_times; struct timer_list add_timer; struct mptcp_sock *sock; + struct rcu_head rcu; }; static DEFINE_SPINLOCK(mptcp_pm_list_lock); @@ -155,7 +156,7 @@ bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, entry = mptcp_pm_del_add_timer(msk, addr, false); ret = entry; - kfree(entry); + kfree_rcu(entry, rcu); return ret; } @@ -345,22 +346,27 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *entry; struct sock *sk = (struct sock *)msk; - struct timer_list *add_timer = NULL; + bool stop_timer = false; + + rcu_read_lock(); spin_lock_bh(&msk->pm.lock); entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (entry && (!check_id || entry->addr.id == addr->id)) { entry->retrans_times = ADD_ADDR_RETRANS_MAX; - add_timer = &entry->add_timer; + stop_timer = true; } if (!check_id && entry) list_del(&entry->list); spin_unlock_bh(&msk->pm.lock); - /* no lock, because sk_stop_timer_sync() is calling timer_delete_sync() */ - if (add_timer) - sk_stop_timer_sync(sk, add_timer); + /* Note: entry might have been removed by another thread. + * We hold rcu_read_lock() to ensure it is not freed under us. + */ + if (stop_timer) + sk_stop_timer_sync(sk, &entry->add_timer); + rcu_read_unlock(); return entry; } @@ -415,7 +421,7 @@ static void mptcp_pm_free_anno_list(struct mptcp_sock *msk) list_for_each_entry_safe(entry, tmp, &free_list, list) { sk_stop_timer_sync(sk, &entry->add_timer); - kfree(entry); + kfree_rcu(entry, rcu); } } diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 2ae95476dba3..0a50fd5edc06 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -672,7 +672,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) { - if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + if (rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { u8 limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2d6b8de35c44..1e413426deee 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -61,11 +61,13 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk) { + unsigned short family = READ_ONCE(sk->sk_family); + #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (sk->sk_prot == &tcpv6_prot) + if (family == AF_INET6) return &inet6_stream_ops; #endif - WARN_ON_ONCE(sk->sk_prot != &tcp_prot); + WARN_ON_ONCE(family != AF_INET); return &inet_stream_ops; } @@ -76,6 +78,13 @@ bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib) if (__mptcp_check_fallback(msk)) return true; + /* The caller possibly is not holding the msk socket lock, but + * in the fallback case only the current subflow is touching + * the OoO queue. + */ + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) + return false; + spin_lock_bh(&msk->fallback_lock); if (!msk->allow_infinite_fallback) { spin_unlock_bh(&msk->fallback_lock); @@ -935,14 +944,19 @@ static void mptcp_reset_rtx_timer(struct sock *sk) bool mptcp_schedule_work(struct sock *sk) { - if (inet_sk_state_load(sk) != TCP_CLOSE && - schedule_work(&mptcp_sk(sk)->work)) { - /* each subflow already holds a reference to the sk, and the - * workqueue is invoked by a subflow, so sk can't go away here. - */ - sock_hold(sk); + if (inet_sk_state_load(sk) == TCP_CLOSE) + return false; + + /* Get a reference on this socket, mptcp_worker() will release it. + * As mptcp_worker() might complete before us, we can not avoid + * a sock_hold()/sock_put() if schedule_work() returns false. + */ + sock_hold(sk); + + if (schedule_work(&mptcp_sk(sk)->work)) return true; - } + + sock_put(sk); return false; } @@ -2397,7 +2411,6 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) /* flags for __mptcp_close_ssk() */ #define MPTCP_CF_PUSH BIT(1) -#define MPTCP_CF_FASTCLOSE BIT(2) /* be sure to send a reset only if the caller asked for it, also * clean completely the subflow status when the subflow reaches @@ -2408,7 +2421,7 @@ static void __mptcp_subflow_disconnect(struct sock *ssk, unsigned int flags) { if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || - (flags & MPTCP_CF_FASTCLOSE)) { + subflow->send_fastclose) { /* The MPTCP code never wait on the subflow sockets, TCP-level * disconnect should never fail */ @@ -2455,14 +2468,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); - if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { - /* be sure to force the tcp_close path - * to generate the egress reset - */ - ssk->sk_lingertime = 0; - sock_set_flag(ssk, SOCK_LINGER); - subflow->send_fastclose = 1; - } + if (subflow->send_fastclose && ssk->sk_state != TCP_CLOSE) + tcp_set_state(ssk, TCP_CLOSE); need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { @@ -2558,7 +2565,8 @@ static void __mptcp_close_subflow(struct sock *sk) if (ssk_state != TCP_CLOSE && (ssk_state != TCP_CLOSE_WAIT || - inet_sk_state_load(sk) != TCP_ESTABLISHED)) + inet_sk_state_load(sk) != TCP_ESTABLISHED || + __mptcp_check_fallback(msk))) continue; /* 'subflow_data_ready' will re-sched once rx queue is empty */ @@ -2657,7 +2665,7 @@ static void __mptcp_retrans(struct sock *sk) } if (!mptcp_send_head(sk)) - return; + goto clear_scheduled; goto reset_timer; } @@ -2688,7 +2696,7 @@ static void __mptcp_retrans(struct sock *sk) if (__mptcp_check_fallback(msk)) { spin_unlock_bh(&msk->fallback_lock); release_sock(ssk); - return; + goto clear_scheduled; } while (info.sent < info.limit) { @@ -2720,6 +2728,15 @@ reset_timer: if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); + +clear_scheduled: + /* If no rtx data was available or in case of fallback, there + * could be left-over scheduled subflows; clear them all + * or later xmit could use bad ones + */ + mptcp_for_each_subflow(msk, subflow) + if (READ_ONCE(subflow->scheduled)) + mptcp_subflow_set_scheduled(subflow, false); } /* schedule the timeout timer for the relevant event: either close timeout @@ -2766,9 +2783,32 @@ static void mptcp_do_fastclose(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); mptcp_set_state(sk, TCP_CLOSE); - mptcp_for_each_subflow_safe(msk, subflow, tmp) - __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), - subflow, MPTCP_CF_FASTCLOSE); + + /* Explicitly send the fastclose reset as need */ + if (__mptcp_check_fallback(msk)) + return; + + mptcp_for_each_subflow_safe(msk, subflow, tmp) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + + /* Some subflow socket states don't allow/need a reset.*/ + if ((1 << ssk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + goto unlock; + + subflow->send_fastclose = 1; + + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 + * issue in __tcp_select_window(), see tcp_disconnect(). + */ + inet_csk(ssk)->icsk_ack.rcv_mss = TCP_MIN_MSS; + + tcp_send_active_reset(ssk, ssk->sk_allocation, + SK_RST_REASON_TCP_ABORT_ON_CLOSE); +unlock: + release_sock(ssk); + } } static void mptcp_worker(struct work_struct *work) @@ -2795,7 +2835,11 @@ static void mptcp_worker(struct work_struct *work) __mptcp_close_subflow(sk); if (mptcp_close_tout_expired(sk)) { + struct mptcp_subflow_context *subflow, *tmp; + mptcp_do_fastclose(sk); + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, subflow->tcp_sock, subflow, 0); mptcp_close_wake_up(sk); } @@ -3220,7 +3264,8 @@ static int mptcp_disconnect(struct sock *sk, int flags) /* msk->subflow is still intact, the following will not free the first * subflow */ - mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); + mptcp_do_fastclose(sk); + mptcp_destroy_common(msk); /* The first subflow is already in TCP_CLOSE status, the following * can't overlap with a fallback anymore @@ -3399,7 +3444,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; } -void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) +void mptcp_destroy_common(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; @@ -3408,7 +3453,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) /* join list will be eventually flushed (with rst) at sock lock release time */ mptcp_for_each_subflow_safe(msk, subflow, tmp) - __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, 0); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); @@ -3426,7 +3471,7 @@ static void mptcp_destroy(struct sock *sk) /* allow the following to close even the initial subflow */ msk->free_first = 1; - mptcp_destroy_common(msk, 0); + mptcp_destroy_common(msk); sk_sockets_allocated_dec(sk); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 379a88e14e8d..6ca97096607c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -509,6 +509,7 @@ struct mptcp_subflow_context { u64 remote_key; u64 idsn; u64 map_seq; + u64 rcv_wnd_sent; u32 snd_isn; u32 token; u32 rel_write_seq; @@ -976,7 +977,7 @@ static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) local_bh_enable(); } -void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags); +void mptcp_destroy_common(struct mptcp_sock *msk); #define MPTCP_TOKEN_MAX_RETRIES 4 diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e8325890a322..af707ce0f624 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -2144,6 +2144,10 @@ void __init mptcp_subflow_init(void) tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; tcp_prot_override.diag_destroy = tcp_abort_override; +#ifdef CONFIG_BPF_SYSCALL + /* Disable sockmap processing for subflows */ + tcp_prot_override.psock_update_sk_prot = NULL; +#endif #if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock @@ -2180,6 +2184,10 @@ void __init mptcp_subflow_init(void) tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; tcpv6_prot_override.diag_destroy = tcp_abort_override; +#ifdef CONFIG_BPF_SYSCALL + /* Disable sockmap processing for subflows */ + tcpv6_prot_override.psock_update_sk_prot = NULL; +#endif #endif mptcp_diag_subflow_init(&subflow_ulp_ops); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2832e0794197..792ca44a461d 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -572,69 +572,6 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key, - const struct nlattr *a) -{ - struct nshhdr *nh; - size_t length; - int err; - u8 flags; - u8 ttl; - int i; - - struct ovs_key_nsh key; - struct ovs_key_nsh mask; - - err = nsh_key_from_nlattr(a, &key, &mask); - if (err) - return err; - - /* Make sure the NSH base header is there */ - if (!pskb_may_pull(skb, skb_network_offset(skb) + NSH_BASE_HDR_LEN)) - return -ENOMEM; - - nh = nsh_hdr(skb); - length = nsh_hdr_len(nh); - - /* Make sure the whole NSH header is there */ - err = skb_ensure_writable(skb, skb_network_offset(skb) + - length); - if (unlikely(err)) - return err; - - nh = nsh_hdr(skb); - skb_postpull_rcsum(skb, nh, length); - flags = nsh_get_flags(nh); - flags = OVS_MASKED(flags, key.base.flags, mask.base.flags); - flow_key->nsh.base.flags = flags; - ttl = nsh_get_ttl(nh); - ttl = OVS_MASKED(ttl, key.base.ttl, mask.base.ttl); - flow_key->nsh.base.ttl = ttl; - nsh_set_flags_and_ttl(nh, flags, ttl); - nh->path_hdr = OVS_MASKED(nh->path_hdr, key.base.path_hdr, - mask.base.path_hdr); - flow_key->nsh.base.path_hdr = nh->path_hdr; - switch (nh->mdtype) { - case NSH_M_TYPE1: - for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) { - nh->md1.context[i] = - OVS_MASKED(nh->md1.context[i], key.context[i], - mask.context[i]); - } - memcpy(flow_key->nsh.context, nh->md1.context, - sizeof(nh->md1.context)); - break; - case NSH_M_TYPE2: - memset(flow_key->nsh.context, 0, - sizeof(flow_key->nsh.context)); - break; - default: - return -EINVAL; - } - skb_postpush_rcsum(skb, nh, length); - return 0; -} - /* Must follow skb_ensure_writable() since that can move the skb data. */ static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) @@ -1130,10 +1067,6 @@ static int execute_masked_set_action(struct sk_buff *skb, get_mask(a, struct ovs_key_ethernet *)); break; - case OVS_KEY_ATTR_NSH: - err = set_nsh(skb, flow_key, a); - break; - case OVS_KEY_ATTR_IPV4: err = set_ipv4(skb, flow_key, nla_data(a), get_mask(a, struct ovs_key_ipv4 *)); @@ -1170,6 +1103,7 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_LABELS: case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4: case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6: + case OVS_KEY_ATTR_NSH: err = -EINVAL; break; } diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ad64bb9ab5e2..1cb4f97335d8 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1305,6 +1305,11 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, return 0; } +/* + * Constructs NSH header 'nh' from attributes of OVS_ACTION_ATTR_PUSH_NSH, + * where 'nh' points to a memory block of 'size' bytes. It's assumed that + * attributes were previously validated with validate_push_nsh(). + */ int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, size_t size) { @@ -1314,8 +1319,6 @@ int nsh_hdr_from_nlattr(const struct nlattr *attr, u8 ttl = 0; int mdlen = 0; - /* validate_nsh has check this, so we needn't do duplicate check here - */ if (size < NSH_BASE_HDR_LEN) return -ENOBUFS; @@ -1359,46 +1362,6 @@ int nsh_hdr_from_nlattr(const struct nlattr *attr, return 0; } -int nsh_key_from_nlattr(const struct nlattr *attr, - struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask) -{ - struct nlattr *a; - int rem; - - /* validate_nsh has check this, so we needn't do duplicate check here - */ - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - - switch (type) { - case OVS_NSH_KEY_ATTR_BASE: { - const struct ovs_nsh_key_base *base = nla_data(a); - const struct ovs_nsh_key_base *base_mask = base + 1; - - nsh->base = *base; - nsh_mask->base = *base_mask; - break; - } - case OVS_NSH_KEY_ATTR_MD1: { - const struct ovs_nsh_key_md1 *md1 = nla_data(a); - const struct ovs_nsh_key_md1 *md1_mask = md1 + 1; - - memcpy(nsh->context, md1->context, sizeof(*md1)); - memcpy(nsh_mask->context, md1_mask->context, - sizeof(*md1_mask)); - break; - } - case OVS_NSH_KEY_ATTR_MD2: - /* Not supported yet */ - return -ENOTSUPP; - default: - return -EINVAL; - } - } - - return 0; -} - static int nsh_key_put_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool is_push_nsh, bool log) @@ -2839,17 +2802,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return err; } -static bool validate_nsh(const struct nlattr *attr, bool is_mask, - bool is_push_nsh, bool log) +static bool validate_push_nsh(const struct nlattr *attr, bool log) { struct sw_flow_match match; struct sw_flow_key key; - int ret = 0; ovs_match_init(&match, &key, true, NULL); - ret = nsh_key_put_from_nlattr(attr, &match, is_mask, - is_push_nsh, log); - return !ret; + return !nsh_key_put_from_nlattr(attr, &match, false, true, log); } /* Return false if there are any non-masked bits set. @@ -2997,13 +2956,6 @@ static int validate_set(const struct nlattr *a, break; - case OVS_KEY_ATTR_NSH: - if (eth_type != htons(ETH_P_NSH)) - return -EINVAL; - if (!validate_nsh(nla_data(a), masked, false, log)) - return -EINVAL; - break; - default: return -EINVAL; } @@ -3437,7 +3389,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return -EINVAL; } mac_proto = MAC_PROTO_NONE; - if (!validate_nsh(nla_data(a), false, true, true)) + if (!validate_push_nsh(nla_data(a), log)) return -EINVAL; break; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index fe7f77fc5f18..ff8cdecbe346 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -65,8 +65,6 @@ int ovs_nla_put_actions(const struct nlattr *attr, void ovs_nla_free_flow_actions(struct sw_flow_actions *); void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); -int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh, - struct ovs_key_nsh *nsh_mask); int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, size_t size); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 396b576390d0..c2b5bc19e091 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -47,12 +47,10 @@ TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(filter, skb); + filter_res = bpf_prog_run_data_pointers(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(filter, skb); + filter_res = bpf_prog_run_data_pointers(filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 3e89927d7116..26ba8c2d20ab 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -195,13 +195,15 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, const struct tcf_connmark_info *ci = to_connmark(a); unsigned char *b = skb_tail_pointer(skb); const struct tcf_connmark_parms *parms; - struct tc_connmark opt = { - .index = ci->tcf_index, - .refcnt = refcount_read(&ci->tcf_refcnt) - ref, - .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, - }; + struct tc_connmark opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + + opt.index = ci->tcf_index; + opt.refcnt = refcount_read(&ci->tcf_refcnt) - ref; + opt.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind; + rcu_read_lock(); parms = rcu_dereference(ci->parms); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 107c6d83dc5c..7c6975632fc2 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -644,13 +644,15 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; - struct tc_ife opt = { - .index = ife->tcf_index, - .refcnt = refcount_read(&ife->tcf_refcnt) - ref, - .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, - }; + struct tc_ife opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + + opt.index = ife->tcf_index, + opt.refcnt = refcount_read(&ife->tcf_refcnt) - ref, + opt.bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, + spin_lock_bh(&ife->tcf_lock); opt.action = ife->tcf_action; p = rcu_dereference_protected(ife->params, diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 7fbe42f0e5c2..a32754a2658b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -97,12 +97,10 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(prog->filter, skb); + filter_res = bpf_prog_run_data_pointers(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(prog->filter, skb); + filter_res = bpf_prog_run_data_pointers(prog->filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index 5337bc462755..2d27f91d8441 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -99,6 +99,9 @@ static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, int i; const struct can_filter *lp; + if (!pskb_may_pull(skb, CAN_MTU)) + return 0; + can_id = em_canid_get_id(skb); if (can_id & CAN_EFF_FLAG) { diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index 64b637f18bc7..48c1bce74f49 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -22,9 +22,12 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data; - unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off; + unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer); u32 val = 0; + if (!ptr) + return 0; + ptr += cmp->off; if (!tcf_valid_offset(skb, ptr, cmp->align)) return 0; diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 4f9f21a05d5e..c65ffa5fff94 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -42,6 +42,8 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em, struct nbyte_data *nbyte = (struct nbyte_data *) em->data; unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer); + if (!ptr) + return 0; ptr += nbyte->hdr.off; if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len)) diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 6b3d0af72c39..692e2be1793e 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -29,12 +29,19 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, struct tcf_pkt_info *info) { struct text_match *tm = EM_TEXT_PRIV(m); + unsigned char *ptr; int from, to; - from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data; + ptr = tcf_get_base_ptr(skb, tm->from_layer); + if (!ptr) + return 0; + from = ptr - skb->data; from += tm->from_offset; - to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data; + ptr = tcf_get_base_ptr(skb, tm->to_layer); + if (!ptr) + return 0; + to = ptr - skb->data; to += tm->to_offset; return skb_find_text(skb, from, to, tm->config) != UINT_MAX; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1e058b46d3e1..f56b18c8aebf 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1599,6 +1599,11 @@ static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; } + if (p->flags & TCQ_F_INGRESS) { + NL_SET_ERR_MSG(extack, + "Cannot add children to ingress/clsact qdisc"); + return -EOPNOTSUPP; + } q = qdisc_leaf(p, clid, extack); if (IS_ERR(q)) return PTR_ERR(q); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 1e008a228ebd..7dee9748a56b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -180,9 +180,10 @@ static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, - int *packets) + int *packets, int budget) { int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; + int cnt = 0; while (bytelimit > 0) { struct sk_buff *nskb = q->dequeue(q); @@ -193,8 +194,10 @@ static void try_bulk_dequeue_skb(struct Qdisc *q, bytelimit -= nskb->len; /* covers GSO len */ skb->next = nskb; skb = nskb; - (*packets)++; /* GSO counts as one pkt */ + if (++cnt >= budget) + break; } + (*packets) += cnt; skb_mark_not_on_list(skb); } @@ -228,7 +231,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, * A requeued skb (via q->gso_skb) can also be a SKB list. */ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, - int *packets) + int *packets, int budget) { const struct netdev_queue *txq = q->dev_queue; struct sk_buff *skb = NULL; @@ -295,7 +298,7 @@ validate: if (skb) { bulk: if (qdisc_may_bulk(q)) - try_bulk_dequeue_skb(q, skb, txq, packets); + try_bulk_dequeue_skb(q, skb, txq, packets, budget); else try_bulk_dequeue_skb_slow(q, skb, packets); } @@ -387,7 +390,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, * >0 - queue is not empty. * */ -static inline bool qdisc_restart(struct Qdisc *q, int *packets) +static inline bool qdisc_restart(struct Qdisc *q, int *packets, int budget) { spinlock_t *root_lock = NULL; struct netdev_queue *txq; @@ -396,7 +399,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) bool validate; /* Dequeue packet */ - skb = dequeue_skb(q, &validate, packets); + skb = dequeue_skb(q, &validate, packets, budget); if (unlikely(!skb)) return false; @@ -414,7 +417,7 @@ void __qdisc_run(struct Qdisc *q) int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; - while (qdisc_restart(q, &packets)) { + while (qdisc_restart(q, &packets, quota)) { quota -= packets; if (quota <= 0) { if (q->flags & TCQ_F_NOLOCK) diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 996c2018f0e6..2afb376299fe 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -73,19 +73,26 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, struct nlattr *attr; void *info = NULL; + rcu_read_lock(); list_for_each_entry_rcu(laddr, address_list, list) addrcnt++; + rcu_read_unlock(); attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt); if (!attr) return -EMSGSIZE; info = nla_data(attr); + rcu_read_lock(); list_for_each_entry_rcu(laddr, address_list, list) { memcpy(info, &laddr->a, sizeof(laddr->a)); memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); info += addrlen; + + if (!--addrcnt) + break; } + rcu_read_unlock(); return 0; } @@ -223,14 +230,15 @@ struct sctp_comm_param { bool net_admin; }; -static size_t inet_assoc_attr_size(struct sctp_association *asoc) +static size_t inet_assoc_attr_size(struct sock *sk, + struct sctp_association *asoc) { int addrlen = sizeof(struct sockaddr_storage); int addrcnt = 0; struct sctp_sockaddr_entry *laddr; list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list, - list) + list, lockdep_sock_is_held(sk)) addrcnt++; return nla_total_size(sizeof(struct sctp_info)) @@ -256,11 +264,14 @@ static int sctp_sock_dump_one(struct sctp_endpoint *ep, struct sctp_transport *t if (err) return err; - rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL); - if (!rep) + lock_sock(sk); + + rep = nlmsg_new(inet_assoc_attr_size(sk, assoc), GFP_KERNEL); + if (!rep) { + release_sock(sk); return -ENOMEM; + } - lock_sock(sk); if (ep != assoc->ep) { err = -EAGAIN; goto out; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 4d258a6e8033..0c56d9673cc1 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -37,10 +37,10 @@ /* 1st Level Abstractions. */ /* Initialize a new transport from provided memory. */ -static struct sctp_transport *sctp_transport_init(struct net *net, - struct sctp_transport *peer, - const union sctp_addr *addr, - gfp_t gfp) +static void sctp_transport_init(struct net *net, + struct sctp_transport *peer, + const union sctp_addr *addr, + gfp_t gfp) { /* Copy in the address. */ peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); @@ -83,8 +83,6 @@ static struct sctp_transport *sctp_transport_init(struct net *net, get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); refcount_set(&peer->refcnt, 1); - - return peer; } /* Allocate and initialize a new transport. */ @@ -96,20 +94,13 @@ struct sctp_transport *sctp_transport_new(struct net *net, transport = kzalloc(sizeof(*transport), gfp); if (!transport) - goto fail; + return NULL; - if (!sctp_transport_init(net, transport, addr, gfp)) - goto fail_init; + sctp_transport_init(net, transport, addr, gfp); SCTP_DBG_OBJCNT_INC(transport); return transport; - -fail_init: - kfree(transport); - -fail: - return NULL; } /* This transport is no longer needed. Free up if possible, or @@ -495,6 +486,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) if (tp->rttvar || tp->srtt) { struct net *net = tp->asoc->base.net; + unsigned int rto_beta, rto_alpha; /* 6.3.1 C3) When a new RTT measurement R' is made, set * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' @@ -506,10 +498,14 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) * For example, assuming the default value of RTO.Alpha of * 1/8, rto_alpha would be expressed as 3. */ - tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta) - + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta); - tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha) - + (rtt >> net->sctp.rto_alpha); + rto_beta = READ_ONCE(net->sctp.rto_beta); + if (rto_beta < 32) + tp->rttvar = tp->rttvar - (tp->rttvar >> rto_beta) + + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> rto_beta); + rto_alpha = READ_ONCE(net->sctp.rto_alpha); + if (rto_alpha < 32) + tp->srtt = tp->srtt - (tp->srtt >> rto_alpha) + + (rtt >> rto_alpha); } else { /* 6.3.1 C2) When the first RTT measurement R is made, set * SRTT <- R, RTTVAR <- R/2. diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 157aace169d4..87c87edadde7 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -890,6 +890,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) return SMC_CLC_DECL_CNFERR; } pclc_base->hdr.typev1 = SMC_TYPE_N; + ini->smc_type_v1 = SMC_TYPE_N; } else { pclc_base->iparea_offset = htons(sizeof(*pclc_smcd)); plen += sizeof(*pclc_prfx) + diff --git a/net/socket.c b/net/socket.c index e8892b218708..e1bf93508f05 100644 --- a/net/socket.c +++ b/net/socket.c @@ -503,21 +503,12 @@ EXPORT_SYMBOL(sock_alloc_file); static int sock_map_fd(struct socket *sock, int flags) { - struct file *newfile; - int fd = get_unused_fd_flags(flags); - if (unlikely(fd < 0)) { - sock_release(sock); - return fd; - } - - newfile = sock_alloc_file(sock, flags, NULL); - if (!IS_ERR(newfile)) { - fd_install(fd, newfile); - return fd; - } + int fd; - put_unused_fd(fd); - return PTR_ERR(newfile); + fd = FD_ADD(flags, sock_alloc_file(sock, flags, NULL)); + if (fd < 0) + sock_release(sock); + return fd; } /** @@ -2012,8 +2003,6 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s int __user *upeer_addrlen, int flags) { struct proto_accept_arg arg = { }; - struct file *newfile; - int newfd; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; @@ -2021,18 +2010,7 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - newfd = get_unused_fd_flags(flags); - if (unlikely(newfd < 0)) - return newfd; - - newfile = do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, - flags); - if (IS_ERR(newfile)) { - put_unused_fd(newfd); - return PTR_ERR(newfile); - } - fd_install(newfd, newfile); - return newfd; + return FD_ADD(flags, do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, flags)); } /* diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 43b1f558b33d..e659fea2da70 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -238,7 +238,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, strp_parser_err(strp, -EMSGSIZE, desc); break; } else if (len <= (ssize_t)head->len - - skb->len - stm->strp.offset) { + (ssize_t)skb->len - stm->strp.offset) { /* Length must be into new skb (and also * greater than zero) */ diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 984e0cf9bf8a..a570e7adf270 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -18,10 +18,9 @@ config SUNRPC_SWAP config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism" - depends on SUNRPC + depends on SUNRPC && CRYPTO default y select SUNRPC_GSS - select CRYPTO select CRYPTO_SKCIPHER select CRYPTO_HASH help diff --git a/net/tipc/net.c b/net/tipc/net.c index 0e95572e56b4..7e65d0b0c4a8 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -145,7 +145,9 @@ void tipc_net_finalize_work(struct work_struct *work) { struct tipc_net *tn = container_of(work, struct tipc_net, work); + rtnl_lock(); tipc_net_finalize(tipc_link_net(tn->bcl), tn->trial_addr); + rtnl_unlock(); } void tipc_net_stop(struct net *net) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 768098dec231..45a606c013fc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1210,25 +1210,16 @@ static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, unix_mkname_bsd(sunaddr, addr_len); if (flags & SOCK_COREDUMP) { - const struct cred *cred; - struct cred *kcred; struct path root; - kcred = prepare_kernel_cred(&init_task); - if (!kcred) { - err = -ENOMEM; - goto fail; - } - task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); - cred = override_creds(kcred); - err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path, - LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | - LOOKUP_NO_MAGICLINKS, &path); - put_cred(revert_creds(cred)); + scoped_with_kernel_creds() + err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path, + LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | + LOOKUP_NO_MAGICLINKS, &path); path_put(&root); if (err) goto fail; @@ -1399,7 +1390,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, idmap = mnt_idmap(parent.mnt); err = security_path_mknod(&parent, dentry, mode, 0); if (!err) - err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); + err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0, NULL); if (err) goto out_path; err = mutex_lock_interruptible(&u->bindlock); @@ -2954,6 +2945,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, u = unix_sk(sk); +redo: /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ @@ -2965,7 +2957,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, struct sk_buff *skb, *last; int chunk; -redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; @@ -3015,7 +3006,6 @@ again: goto out; } - mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); @@ -3286,9 +3276,6 @@ EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { - struct file *f; - int fd; - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -3298,18 +3285,7 @@ static int unix_open_file(struct sock *sk) if (!unix_sk(sk)->path.dentry) return -ENOENT; - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) - return fd; - - f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred()); - if (IS_ERR(f)) { - put_unused_fd(fd); - return PTR_ERR(f); - } - - fd_install(fd, f); - return fd; + return FD_ADD(O_CLOEXEC, dentry_open(&unix_sk(sk)->path, O_PATH, current_cred())); } static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 684ab03137b6..65396a4e1b07 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -145,6 +145,7 @@ enum unix_vertex_index { }; static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; +static unsigned long unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { @@ -153,6 +154,7 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) if (!vertex) { vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); vertex->index = unix_vertex_unvisited_index; + vertex->scc_index = ++unix_vertex_max_scc_index; vertex->out_degree = 0; INIT_LIST_HEAD(&vertex->edges); INIT_LIST_HEAD(&vertex->scc_entry); @@ -489,10 +491,15 @@ prev_vertex: scc_dead = unix_vertex_dead(v); } - if (scc_dead) + if (scc_dead) { unix_collect_skb(&scc, hitlist); - else if (!unix_graph_maybe_cyclic) - unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + } else { + if (unix_vertex_max_scc_index < vertex->scc_index) + unix_vertex_max_scc_index = vertex->scc_index; + + if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + } list_del(&scc); } @@ -507,6 +514,7 @@ static void unix_walk_scc(struct sk_buff_head *hitlist) unsigned long last_index = UNIX_VERTEX_INDEX_START; unix_graph_maybe_cyclic = false; + unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; /* Visit every vertex exactly once. * __unix_walk_scc() moves visited vertices to unix_visited_vertices. diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 76763247a377..a9ca9c3b87b3 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1661,18 +1661,40 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, timeout = schedule_timeout(timeout); lock_sock(sk); - if (signal_pending(current)) { - err = sock_intr_errno(timeout); - sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE; - sock->state = SS_UNCONNECTED; - vsock_transport_cancel_pkt(vsk); - vsock_remove_connected(vsk); - goto out_wait; - } else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) { - err = -ETIMEDOUT; + /* Connection established. Whatever happens to socket once we + * release it, that's not connect()'s concern. No need to go + * into signal and timeout handling. Call it a day. + * + * Note that allowing to "reset" an already established socket + * here is racy and insecure. + */ + if (sk->sk_state == TCP_ESTABLISHED) + break; + + /* If connection was _not_ established and a signal/timeout came + * to be, we want the socket's state reset. User space may want + * to retry. + * + * sk_state != TCP_ESTABLISHED implies that socket is not on + * vsock_connected_table. We keep the binding and the transport + * assigned. + */ + if (signal_pending(current) || timeout == 0) { + err = timeout == 0 ? -ETIMEDOUT : sock_intr_errno(timeout); + + /* Listener might have already responded with + * VIRTIO_VSOCK_OP_RESPONSE. Its handling expects our + * sk_state == TCP_SYN_SENT, which hereby we break. + * In such case VIRTIO_VSOCK_OP_RST will follow. + */ sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; + + /* Try to cancel VIRTIO_VSOCK_OP_REQUEST skb sent out by + * transport->connect(). + */ vsock_transport_cancel_pkt(vsk); + goto out_wait; } diff --git a/net/wireless/core.c b/net/wireless/core.c index 797f9f2004a6..54a34d8d356e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1787,6 +1787,62 @@ bool wiphy_delayed_work_pending(struct wiphy *wiphy, } EXPORT_SYMBOL_GPL(wiphy_delayed_work_pending); +enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t) +{ + struct wiphy_hrtimer_work *hrwork = + container_of(t, struct wiphy_hrtimer_work, timer); + + wiphy_work_queue(hrwork->wiphy, &hrwork->work); + + return HRTIMER_NORESTART; +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_timer); + +void wiphy_hrtimer_work_queue(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork, + ktime_t delay) +{ + trace_wiphy_hrtimer_work_queue(wiphy, &hrwork->work, delay); + + if (!delay) { + hrtimer_cancel(&hrwork->timer); + wiphy_work_queue(wiphy, &hrwork->work); + return; + } + + hrwork->wiphy = wiphy; + hrtimer_start_range_ns(&hrwork->timer, delay, + 1000 * NSEC_PER_USEC, HRTIMER_MODE_REL); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_queue); + +void wiphy_hrtimer_work_cancel(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + lockdep_assert_held(&wiphy->mtx); + + hrtimer_cancel(&hrwork->timer); + wiphy_work_cancel(wiphy, &hrwork->work); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_cancel); + +void wiphy_hrtimer_work_flush(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + lockdep_assert_held(&wiphy->mtx); + + hrtimer_cancel(&hrwork->timer); + wiphy_work_flush(wiphy, &hrwork->work); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_flush); + +bool wiphy_hrtimer_work_pending(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + return hrtimer_is_queued(&hrwork->timer); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_pending); + static int __init cfg80211_init(void) { int err; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 8a4c34112eb5..2b71f1d867a0 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -304,6 +304,27 @@ TRACE_EVENT(wiphy_delayed_work_queue, __entry->delay) ); +TRACE_EVENT(wiphy_hrtimer_work_queue, + TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work, + ktime_t delay), + TP_ARGS(wiphy, work, delay), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(void *, instance) + __field(void *, func) + __field(ktime_t, delay) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->instance = work; + __entry->func = work->func; + __entry->delay = delay; + ), + TP_printk(WIPHY_PR_FMT " instance=%p func=%pS delay=%llu", + WIPHY_PR_ARG, __entry->instance, __entry->func, + __entry->delay) +); + TRACE_EVENT(wiphy_work_worker_start, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy), diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7b0c68a70888..69bbcca8ac75 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,20 +36,13 @@ #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET 32 -struct xsk_addr_node { - u64 addr; - struct list_head addr_node; -}; - -struct xsk_addr_head { +struct xsk_addrs { u32 num_descs; - struct list_head addrs_list; + u64 addrs[MAX_SKB_FRAGS + 1]; }; static struct kmem_cache *xsk_tx_generic_cache; -#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb)) - void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) @@ -558,29 +551,68 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) return ret; } +static bool xsk_skb_destructor_is_addr(struct sk_buff *skb) +{ + return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL; +} + +static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) +{ + return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); +} + +static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) +{ + skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); +} + +static void xsk_inc_num_desc(struct sk_buff *skb) +{ + struct xsk_addrs *xsk_addr; + + if (!xsk_skb_destructor_is_addr(skb)) { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + xsk_addr->num_descs++; + } +} + +static u32 xsk_get_num_desc(struct sk_buff *skb) +{ + struct xsk_addrs *xsk_addr; + + if (xsk_skb_destructor_is_addr(skb)) + return 1; + + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + + return xsk_addr->num_descs; +} + static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, struct sk_buff *skb) { - struct xsk_addr_node *pos, *tmp; + u32 num_descs = xsk_get_num_desc(skb); + struct xsk_addrs *xsk_addr; u32 descs_processed = 0; unsigned long flags; - u32 idx; + u32 idx, i; spin_lock_irqsave(&pool->cq_lock, flags); idx = xskq_get_prod(pool->cq); - xskq_prod_write_addr(pool->cq, idx, - (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg); - descs_processed++; + if (unlikely(num_descs > 1)) { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; - if (unlikely(XSKCB(skb)->num_descs > 1)) { - list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { + for (i = 0; i < num_descs; i++) { xskq_prod_write_addr(pool->cq, idx + descs_processed, - pos->addr); + xsk_addr->addrs[i]); descs_processed++; - list_del(&pos->addr_node); - kmem_cache_free(xsk_tx_generic_cache, pos); } + kmem_cache_free(xsk_tx_generic_cache, xsk_addr); + } else { + xskq_prod_write_addr(pool->cq, idx, + xsk_skb_destructor_get_addr(skb)); + descs_processed++; } xskq_prod_submit_n(pool->cq, descs_processed); spin_unlock_irqrestore(&pool->cq_lock, flags); @@ -595,16 +627,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) spin_unlock_irqrestore(&pool->cq_lock, flags); } -static void xsk_inc_num_desc(struct sk_buff *skb) -{ - XSKCB(skb)->num_descs++; -} - -static u32 xsk_get_num_desc(struct sk_buff *skb) -{ - return XSKCB(skb)->num_descs; -} - static void xsk_destruct_skb(struct sk_buff *skb) { struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; @@ -621,27 +643,22 @@ static void xsk_destruct_skb(struct sk_buff *skb) static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, u64 addr) { - BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb)); - INIT_LIST_HEAD(&XSKCB(skb)->addrs_list); skb->dev = xs->dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); - XSKCB(skb)->num_descs = 0; skb->destructor = xsk_destruct_skb; - skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr; + xsk_skb_destructor_set_addr(skb, addr); } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); u32 num_descs = xsk_get_num_desc(skb); - struct xsk_addr_node *pos, *tmp; + struct xsk_addrs *xsk_addr; if (unlikely(num_descs > 1)) { - list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { - list_del(&pos->addr_node); - kmem_cache_free(xsk_tx_generic_cache, pos); - } + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + kmem_cache_free(xsk_tx_generic_cache, xsk_addr); } skb->destructor = sock_wfree; @@ -701,7 +718,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; - struct xsk_addr_node *xsk_addr; struct sk_buff *skb = xs->skb; struct page *page; void *buffer; @@ -727,16 +743,26 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, return ERR_PTR(err); } } else { - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); - if (!xsk_addr) - return ERR_PTR(-ENOMEM); + struct xsk_addrs *xsk_addr; + + if (xsk_skb_destructor_is_addr(skb)) { + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, + GFP_KERNEL); + if (!xsk_addr) + return ERR_PTR(-ENOMEM); + + xsk_addr->num_descs = 1; + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; + } else { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + } /* in case of -EOVERFLOW that could happen below, * xsk_consume_skb() will release this node as whole skb * would be dropped, which implies freeing all list elements */ - xsk_addr->addr = desc->addr; - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; } len = desc->len; @@ -813,10 +839,25 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, } } else { int nr_frags = skb_shinfo(skb)->nr_frags; - struct xsk_addr_node *xsk_addr; + struct xsk_addrs *xsk_addr; struct page *page; u8 *vaddr; + if (xsk_skb_destructor_is_addr(skb)) { + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, + GFP_KERNEL); + if (!xsk_addr) { + err = -ENOMEM; + goto free_err; + } + + xsk_addr->num_descs = 1; + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; + } else { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + } + if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { err = -EOVERFLOW; goto free_err; @@ -828,13 +869,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, goto free_err; } - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); - if (!xsk_addr) { - __free_page(page); - err = -ENOMEM; - goto free_err; - } - vaddr = kmap_local_page(page); memcpy(vaddr, buffer, len); kunmap_local(vaddr); @@ -842,8 +876,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); - xsk_addr->addr = desc->addr; - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; } } @@ -1904,7 +1937,7 @@ static int __init xsk_init(void) goto out_pernet; xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", - sizeof(struct xsk_addr_node), + sizeof(struct xsk_addrs), 0, SLAB_HWCACHE_ALIGN, NULL); if (!xsk_tx_generic_cache) { err = -ENOMEM; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 44b9de6e4e77..52ae0e034d29 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -438,7 +438,7 @@ ok: check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->props.mode == XFRM_MODE_TUNNEL; - switch (x->inner_mode.family) { + switch (skb_dst(skb)->ops->family) { case AF_INET: /* Check for IPv4 options */ if (ip_hdr(skb)->ihl != 5) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 9077730ff7d0..54222fcbd7fd 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -698,7 +698,7 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x) return; if (x->outer_mode.encap == XFRM_MODE_TUNNEL) { - switch (x->outer_mode.family) { + switch (skb_dst(skb)->ops->family) { case AF_INET: xo->inner_ipproto = ip_hdr(skb)->protocol; break; @@ -772,8 +772,12 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) /* Exclusive direct xmit for tunnel mode, as * some filtering or matching rules may apply * in transport mode. + * Locally generated packets also require + * the normal XFRM path for L2 header setup, + * as the hardware needs the L2 header to match + * for encryption, so skip direct output as well. */ - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL && !skb->sk) return xfrm_dev_direct_output(sk, x, skb); return xfrm_output_resume(sk, skb, 0); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index d213ca3653a8..9e14e453b55c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -592,6 +592,7 @@ void xfrm_state_free(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_free); +static void xfrm_state_delete_tunnel(struct xfrm_state *x); static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (x->mode_cbs && x->mode_cbs->destroy_state) @@ -607,6 +608,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) kfree(x->replay_esn); kfree(x->preplay_esn); xfrm_unset_type_offload(x); + xfrm_state_delete_tunnel(x); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -806,7 +808,6 @@ void __xfrm_state_destroy(struct xfrm_state *x) } EXPORT_SYMBOL(__xfrm_state_destroy); -static void xfrm_state_delete_tunnel(struct xfrm_state *x); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -2073,6 +2074,7 @@ static struct xfrm_state *xfrm_state_clone_and_setup(struct xfrm_state *orig, return x; error: + x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); out: return NULL; @@ -2157,11 +2159,15 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, xfrm_state_insert(xc); } else { if (xfrm_state_add(xc) < 0) - goto error; + goto error_add; } return xc; +error_add: + if (xuo) + xfrm_dev_state_delete(xc); error: + xc->km.state = XFRM_STATE_DEAD; xfrm_state_put(xc); return NULL; } @@ -2191,14 +2197,18 @@ int xfrm_state_update(struct xfrm_state *x) } if (x1->km.state == XFRM_STATE_ACQ) { - if (x->dir && x1->dir != x->dir) + if (x->dir && x1->dir != x->dir) { + to_put = x1; goto out; + } __xfrm_state_insert(x); x = NULL; } else { - if (x1->dir != x->dir) + if (x1->dir != x->dir) { + to_put = x1; goto out; + } } err = 0; @@ -3298,6 +3308,7 @@ out_bydst: void xfrm_state_fini(struct net *net) { unsigned int sz; + int i; flush_work(&net->xfrm.state_hash_work); xfrm_state_flush(net, 0, false); @@ -3305,14 +3316,17 @@ void xfrm_state_fini(struct net *net) WARN_ON(!list_empty(&net->xfrm.state_all)); + for (i = 0; i <= net->xfrm.state_hmask; i++) { + WARN_ON(!hlist_empty(net->xfrm.state_byseq + i)); + WARN_ON(!hlist_empty(net->xfrm.state_byspi + i)); + WARN_ON(!hlist_empty(net->xfrm.state_bysrc + i)); + WARN_ON(!hlist_empty(net->xfrm.state_bydst + i)); + } + sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head); - WARN_ON(!hlist_empty(net->xfrm.state_byseq)); xfrm_hash_free(net->xfrm.state_byseq, sz); - WARN_ON(!hlist_empty(net->xfrm.state_byspi)); xfrm_hash_free(net->xfrm.state_byspi, sz); - WARN_ON(!hlist_empty(net->xfrm.state_bysrc)); xfrm_hash_free(net->xfrm.state_bysrc, sz); - WARN_ON(!hlist_empty(net->xfrm.state_bydst)); xfrm_hash_free(net->xfrm.state_bydst, sz); free_percpu(net->xfrm.state_cache_input); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 010c9e6638c0..403b5ecac2c5 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -947,8 +947,11 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (attrs[XFRMA_SA_PCPU]) { x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); - if (x->pcpu_num >= num_possible_cpus()) + if (x->pcpu_num >= num_possible_cpus()) { + err = -ERANGE; + NL_SET_ERR_MSG(extack, "pCPU number too big"); goto error; + } } err = __xfrm_init_state(x, extack); @@ -3035,6 +3038,9 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, } xfrm_state_free(x); + xfrm_dev_policy_delete(xp); + xfrm_dev_policy_free(xp); + security_xfrm_policy_free(xp->security); kfree(xp); return 0; diff --git a/rust/Makefile b/rust/Makefile index 23c7ae905bd2..7842ad0a4ea7 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -69,6 +69,9 @@ core-edition := $(if $(call rustc-min-version,108700),2024,2021) # the time being (https://github.com/rust-lang/rust/issues/144521). rustdoc_modifiers_workaround := $(if $(call rustc-min-version,108800),-Cunsafe-allow-abi-mismatch=fixed-x18) +# Similarly, for doctests (https://github.com/rust-lang/rust/issues/146465). +doctests_modifiers_workaround := $(rustdoc_modifiers_workaround)$(if $(call rustc-min-version,109100),$(comma)sanitizer) + # `rustc` recognizes `--remap-path-prefix` since 1.26.0, but `rustdoc` only # since Rust 1.81.0. Moreover, `rustdoc` ICEs on out-of-tree builds since Rust # 1.82.0 (https://github.com/rust-lang/rust/issues/138520). Thus workaround both @@ -127,9 +130,14 @@ rustdoc-core: private rustc_target_flags = --edition=$(core-edition) $(core-cfgs rustdoc-core: $(RUST_LIB_SRC)/core/src/lib.rs rustdoc-clean FORCE +$(call if_changed,rustdoc) +# Even if `rustdoc` targets are not kernel objects, they should still be +# treated as such so that we pass the same flags. Otherwise, for instance, +# `rustdoc` will complain about missing sanitizer flags causing an ABI mismatch. +rustdoc-compiler_builtins: private is-kernel-object := y rustdoc-compiler_builtins: $(src)/compiler_builtins.rs rustdoc-core FORCE +$(call if_changed,rustdoc) +rustdoc-ffi: private is-kernel-object := y rustdoc-ffi: $(src)/ffi.rs rustdoc-core FORCE +$(call if_changed,rustdoc) @@ -147,6 +155,7 @@ rustdoc-pin_init: $(src)/pin-init/src/lib.rs rustdoc-pin_init_internal \ rustdoc-macros FORCE +$(call if_changed,rustdoc) +rustdoc-kernel: private is-kernel-object := y rustdoc-kernel: private rustc_target_flags = --extern ffi --extern pin_init \ --extern build_error --extern macros \ --extern bindings --extern uapi @@ -230,7 +239,7 @@ quiet_cmd_rustdoc_test_kernel = RUSTDOC TK $< --extern bindings --extern uapi \ --no-run --crate-name kernel -Zunstable-options \ --sysroot=/dev/null \ - $(rustdoc_modifiers_workaround) \ + $(doctests_modifiers_workaround) \ --test-builder $(objtree)/scripts/rustdoc_test_builder \ $< $(rustdoc_test_kernel_quiet); \ $(objtree)/scripts/rustdoc_test_gen @@ -289,7 +298,7 @@ bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \ -fno-inline-functions-called-once -fsanitize=bounds-strict \ -fstrict-flex-arrays=% -fmin-function-alignment=% \ -fzero-init-padding-bits=% -mno-fdpic \ - --param=% --param asan-% + --param=% --param asan-% -fno-isolate-erroneous-paths-dereference # Derived from `scripts/Makefile.clang`. BINDGEN_TARGET_x86 := x86_64-linux-gnu @@ -522,6 +531,10 @@ $(obj)/pin_init.o: $(src)/pin-init/src/lib.rs $(obj)/compiler_builtins.o \ $(obj)/$(libpin_init_internal_name) $(obj)/$(libmacros_name) FORCE +$(call if_changed_rule,rustc_library) +# Even if normally `build_error` is not a kernel object, it should still be +# treated as such so that we pass the same flags. Otherwise, for instance, +# `rustc` will complain about missing sanitizer flags causing an ABI mismatch. +$(obj)/build_error.o: private is-kernel-object := y $(obj)/build_error.o: private skip_gendwarfksyms = 1 $(obj)/build_error.o: $(src)/build_error.rs $(obj)/compiler_builtins.o FORCE +$(call if_changed_rule,rustc_library) diff --git a/rust/kernel/debugfs/traits.rs b/rust/kernel/debugfs/traits.rs index ab009eb254b3..92054fed2136 100644 --- a/rust/kernel/debugfs/traits.rs +++ b/rust/kernel/debugfs/traits.rs @@ -4,14 +4,11 @@ //! Traits for rendering or updating values exported to DebugFS. use crate::prelude::*; +use crate::sync::atomic::{Atomic, AtomicBasicOps, AtomicType, Relaxed}; use crate::sync::Mutex; use crate::uaccess::UserSliceReader; use core::fmt::{self, Debug, Formatter}; use core::str::FromStr; -use core::sync::atomic::{ - AtomicI16, AtomicI32, AtomicI64, AtomicI8, AtomicIsize, AtomicU16, AtomicU32, AtomicU64, - AtomicU8, AtomicUsize, Ordering, -}; /// A trait for types that can be written into a string. /// @@ -50,7 +47,7 @@ pub trait Reader { fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result; } -impl<T: FromStr> Reader for Mutex<T> { +impl<T: FromStr + Unpin> Reader for Mutex<T> { fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result { let mut buf = [0u8; 128]; if reader.len() > buf.len() { @@ -66,37 +63,21 @@ impl<T: FromStr> Reader for Mutex<T> { } } -macro_rules! impl_reader_for_atomic { - ($(($atomic_type:ty, $int_type:ty)),*) => { - $( - impl Reader for $atomic_type { - fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result { - let mut buf = [0u8; 21]; // Enough for a 64-bit number. - if reader.len() > buf.len() { - return Err(EINVAL); - } - let n = reader.len(); - reader.read_slice(&mut buf[..n])?; +impl<T: AtomicType + FromStr> Reader for Atomic<T> +where + T::Repr: AtomicBasicOps, +{ + fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result { + let mut buf = [0u8; 21]; // Enough for a 64-bit number. + if reader.len() > buf.len() { + return Err(EINVAL); + } + let n = reader.len(); + reader.read_slice(&mut buf[..n])?; - let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?; - let val = s.trim().parse::<$int_type>().map_err(|_| EINVAL)?; - self.store(val, Ordering::Relaxed); - Ok(()) - } - } - )* - }; + let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?; + let val = s.trim().parse::<T>().map_err(|_| EINVAL)?; + self.store(val, Relaxed); + Ok(()) + } } - -impl_reader_for_atomic!( - (AtomicI16, i16), - (AtomicI32, i32), - (AtomicI64, i64), - (AtomicI8, i8), - (AtomicIsize, isize), - (AtomicU16, u16), - (AtomicU32, u32), - (AtomicU64, u64), - (AtomicU8, u8), - (AtomicUsize, usize) -); diff --git a/rust/kernel/devres.rs b/rust/kernel/devres.rs index 10a6a1789854..2392c281459e 100644 --- a/rust/kernel/devres.rs +++ b/rust/kernel/devres.rs @@ -103,7 +103,7 @@ struct Inner<T: Send> { /// /// # Invariants /// -/// [`Self::inner`] is guaranteed to be initialized and is always accessed read-only. +/// `Self::inner` is guaranteed to be initialized and is always accessed read-only. #[pin_data(PinnedDrop)] pub struct Devres<T: Send> { dev: ARef<Device>, diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs index 016a6bcaf080..3afc376be42d 100644 --- a/rust/kernel/sync/atomic.rs +++ b/rust/kernel/sync/atomic.rs @@ -22,9 +22,10 @@ mod predefine; pub use internal::AtomicImpl; pub use ordering::{Acquire, Full, Relaxed, Release}; +pub(crate) use internal::{AtomicArithmeticOps, AtomicBasicOps, AtomicExchangeOps}; use crate::build_error; -use internal::{AtomicArithmeticOps, AtomicBasicOps, AtomicExchangeOps, AtomicRepr}; +use internal::AtomicRepr; use ordering::OrderingType; /// A memory location which can be safely modified from multiple execution contexts. @@ -306,6 +307,15 @@ where } } +impl<T: AtomicType + core::fmt::Debug> core::fmt::Debug for Atomic<T> +where + T::Repr: AtomicBasicOps, +{ + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + core::fmt::Debug::fmt(&self.load(Relaxed), f) + } +} + impl<T: AtomicType> Atomic<T> where T::Repr: AtomicExchangeOps, diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs index c6ec64295c9f..aa5b9a7a726d 100644 --- a/rust/kernel/sync/condvar.rs +++ b/rust/kernel/sync/condvar.rs @@ -36,7 +36,7 @@ pub use new_condvar; /// spuriously. /// /// Instances of [`CondVar`] need a lock class and to be pinned. The recommended way to create such -/// instances is with the [`pin_init`](crate::pin_init!) and [`new_condvar`] macros. +/// instances is with the [`pin_init`](pin_init::pin_init!) and [`new_condvar`] macros. /// /// # Examples /// diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs index 27202beef90c..cb00fdb94ffd 100644 --- a/rust/kernel/sync/lock.rs +++ b/rust/kernel/sync/lock.rs @@ -11,7 +11,7 @@ use crate::{ types::{NotThreadSafe, Opaque, ScopeGuard}, }; use core::{cell::UnsafeCell, marker::PhantomPinned, pin::Pin}; -use pin_init::{pin_data, pin_init, PinInit}; +use pin_init::{pin_data, pin_init, PinInit, Wrapper}; pub mod mutex; pub mod spinlock; @@ -115,6 +115,7 @@ pub struct Lock<T: ?Sized, B: Backend> { _pin: PhantomPinned, /// The data protected by the lock. + #[pin] pub(crate) data: UnsafeCell<T>, } @@ -127,9 +128,13 @@ unsafe impl<T: ?Sized + Send, B: Backend> Sync for Lock<T, B> {} impl<T, B: Backend> Lock<T, B> { /// Constructs a new lock initialiser. - pub fn new(t: T, name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit<Self> { + pub fn new( + t: impl PinInit<T>, + name: &'static CStr, + key: Pin<&'static LockClassKey>, + ) -> impl PinInit<Self> { pin_init!(Self { - data: UnsafeCell::new(t), + data <- UnsafeCell::pin_init(t), _pin: PhantomPinned, // SAFETY: `slot` is valid while the closure is called and both `name` and `key` have // static lifetimes so they live indefinitely. @@ -240,6 +245,31 @@ impl<'a, T: ?Sized, B: Backend> Guard<'a, T, B> { cb() } + + /// Returns a pinned mutable reference to the protected data. + /// + /// The guard implements [`DerefMut`] when `T: Unpin`, so for [`Unpin`] + /// types [`DerefMut`] should be used instead of this function. + /// + /// [`DerefMut`]: core::ops::DerefMut + /// [`Unpin`]: core::marker::Unpin + /// + /// # Examples + /// + /// ``` + /// # use kernel::sync::{Mutex, MutexGuard}; + /// # use core::{pin::Pin, marker::PhantomPinned}; + /// struct Data(PhantomPinned); + /// + /// fn example(mutex: &Mutex<Data>) { + /// let mut data: MutexGuard<'_, Data> = mutex.lock(); + /// let mut data: Pin<&mut Data> = data.as_mut(); + /// } + /// ``` + pub fn as_mut(&mut self) -> Pin<&mut T> { + // SAFETY: `self.lock.data` is structurally pinned. + unsafe { Pin::new_unchecked(&mut *self.lock.data.get()) } + } } impl<T: ?Sized, B: Backend> core::ops::Deref for Guard<'_, T, B> { @@ -251,7 +281,10 @@ impl<T: ?Sized, B: Backend> core::ops::Deref for Guard<'_, T, B> { } } -impl<T: ?Sized, B: Backend> core::ops::DerefMut for Guard<'_, T, B> { +impl<T: ?Sized, B: Backend> core::ops::DerefMut for Guard<'_, T, B> +where + T: Unpin, +{ fn deref_mut(&mut self) -> &mut Self::Target { // SAFETY: The caller owns the lock, so it is safe to deref the protected data. unsafe { &mut *self.lock.data.get() } diff --git a/rust/kernel/sync/lock/global.rs b/rust/kernel/sync/lock/global.rs index d65f94b5caf2..38b448032799 100644 --- a/rust/kernel/sync/lock/global.rs +++ b/rust/kernel/sync/lock/global.rs @@ -106,7 +106,10 @@ impl<B: GlobalLockBackend> core::ops::Deref for GlobalGuard<B> { } } -impl<B: GlobalLockBackend> core::ops::DerefMut for GlobalGuard<B> { +impl<B: GlobalLockBackend> core::ops::DerefMut for GlobalGuard<B> +where + B::Item: Unpin, +{ fn deref_mut(&mut self) -> &mut Self::Target { &mut self.inner } diff --git a/samples/rust/rust_debugfs.rs b/samples/rust/rust_debugfs.rs index 82b61a15a34b..711faa07bece 100644 --- a/samples/rust/rust_debugfs.rs +++ b/samples/rust/rust_debugfs.rs @@ -32,14 +32,12 @@ //! ``` use core::str::FromStr; -use core::sync::atomic::AtomicUsize; -use core::sync::atomic::Ordering; use kernel::c_str; use kernel::debugfs::{Dir, File}; use kernel::new_mutex; use kernel::prelude::*; +use kernel::sync::atomic::{Atomic, Relaxed}; use kernel::sync::Mutex; - use kernel::{acpi, device::Core, of, platform, str::CString, types::ARef}; kernel::module_platform_driver! { @@ -59,7 +57,7 @@ struct RustDebugFs { #[pin] _compatible: File<CString>, #[pin] - counter: File<AtomicUsize>, + counter: File<Atomic<usize>>, #[pin] inner: File<Mutex<Inner>>, } @@ -109,7 +107,7 @@ impl platform::Driver for RustDebugFs { ) -> Result<Pin<KBox<Self>>> { let result = KBox::try_pin_init(RustDebugFs::new(pdev), GFP_KERNEL)?; // We can still mutate fields through the files which are atomic or mutexed: - result.counter.store(91, Ordering::Relaxed); + result.counter.store(91, Relaxed); { let mut guard = result.inner.lock(); guard.x = guard.y; @@ -120,8 +118,8 @@ impl platform::Driver for RustDebugFs { } impl RustDebugFs { - fn build_counter(dir: &Dir) -> impl PinInit<File<AtomicUsize>> + '_ { - dir.read_write_file(c_str!("counter"), AtomicUsize::new(0)) + fn build_counter(dir: &Dir) -> impl PinInit<File<Atomic<usize>>> + '_ { + dir.read_write_file(c_str!("counter"), Atomic::<usize>::new(0)) } fn build_inner(dir: &Dir) -> impl PinInit<File<Mutex<Inner>>> + '_ { diff --git a/samples/rust/rust_debugfs_scoped.rs b/samples/rust/rust_debugfs_scoped.rs index b0c4e76b123e..9f0ec5f24cda 100644 --- a/samples/rust/rust_debugfs_scoped.rs +++ b/samples/rust/rust_debugfs_scoped.rs @@ -6,9 +6,9 @@ //! `Scope::dir` to create a variety of files without the need to separately //! track them all. -use core::sync::atomic::AtomicUsize; use kernel::debugfs::{Dir, Scope}; use kernel::prelude::*; +use kernel::sync::atomic::Atomic; use kernel::sync::Mutex; use kernel::{c_str, new_mutex, str::CString}; @@ -62,7 +62,7 @@ fn create_file_write( let file_name = CString::try_from_fmt(fmt!("{name_str}"))?; for sub in items { nums.push( - AtomicUsize::new(sub.parse().map_err(|_| EINVAL)?), + Atomic::<usize>::new(sub.parse().map_err(|_| EINVAL)?), GFP_KERNEL, )?; } @@ -109,7 +109,7 @@ impl ModuleData { struct DeviceData { name: CString, - nums: KVec<AtomicUsize>, + nums: KVec<Atomic<usize>>, } fn init_control(base_dir: &Dir, dyn_dirs: Dir) -> impl PinInit<Scope<ModuleData>> + '_ { diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c index 49c7a46cee07..424a6fa15723 100644 --- a/samples/vfs/test-statx.c +++ b/samples/vfs/test-statx.c @@ -19,6 +19,12 @@ #include <time.h> #include <sys/syscall.h> #include <sys/types.h> + +// Work around glibc header silliness +#undef AT_RENAME_NOREPLACE +#undef AT_RENAME_EXCHANGE +#undef AT_RENAME_WHITEOUT + #include <linux/stat.h> #include <linux/fcntl.h> #define statx foo diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c index 8c6cb57d5cfc..24cf7d7a1972 100644 --- a/samples/watch_queue/watch_test.c +++ b/samples/watch_queue/watch_test.c @@ -16,6 +16,12 @@ #include <errno.h> #include <sys/ioctl.h> #include <limits.h> + +// Work around glibc header silliness +#undef AT_RENAME_NOREPLACE +#undef AT_RENAME_EXCHANGE +#undef AT_RENAME_WHITEOUT + #include <linux/watch_queue.h> #include <linux/unistd.h> #include <linux/keyctl.h> diff --git a/scripts/Makefile.build b/scripts/Makefile.build index d0ee33a487be..52c08c4eb0b9 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -167,7 +167,7 @@ else ifeq ($(KBUILD_CHECKSRC),2) endif ifneq ($(KBUILD_EXTRA_WARN),) - cmd_checkdoc = PYTHONDONTWRITEBYTECODE=1 $(KERNELDOC) -none $(KDOCFLAGS) \ + cmd_checkdoc = PYTHONDONTWRITEBYTECODE=1 $(PYTHON3) $(KERNELDOC) -none $(KDOCFLAGS) \ $(if $(findstring 2, $(KBUILD_EXTRA_WARN)), -Wall) \ $< endif diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 6af392f9cd02..68e6fafcb80c 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -28,8 +28,10 @@ endif KBUILD_CFLAGS-$(CONFIG_CC_NO_ARRAY_BOUNDS) += -Wno-array-bounds ifdef CONFIG_CC_IS_CLANG -# The kernel builds with '-std=gnu11' so use of GNU extensions is acceptable. +# The kernel builds with '-std=gnu11' and '-fms-extensions' so use of GNU and +# Microsoft extensions is acceptable. KBUILD_CFLAGS += -Wno-gnu +KBUILD_CFLAGS += -Wno-microsoft-anon-tag # Clang checks for overflow/truncation with '%p', while GCC does not: # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219 diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 1d581ba5df66..28a1c08e3b22 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -20,7 +20,7 @@ name-fix-token = $(subst $(comma),_,$(subst -,_,$1)) name-fix = $(call stringify,$(call name-fix-token,$1)) basename_flags = -DKBUILD_BASENAME=$(call name-fix,$(basetarget)) modname_flags = -DKBUILD_MODNAME=$(call name-fix,$(modname)) \ - -D__KBUILD_MODNAME=kmod_$(call name-fix-token,$(modname)) + -D__KBUILD_MODNAME=$(call name-fix-token,$(modname)) modfile_flags = -DKBUILD_MODFILE=$(call stringify,$(modfile)) _c_flags = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), \ @@ -191,13 +191,13 @@ objtool-args-$(CONFIG_HAVE_STATIC_CALL_INLINE) += --static-call objtool-args-$(CONFIG_HAVE_UACCESS_VALIDATION) += --uaccess objtool-args-$(or $(CONFIG_GCOV_KERNEL),$(CONFIG_KCOV)) += --no-unreachable objtool-args-$(CONFIG_PREFIX_SYMBOLS) += --prefix=$(CONFIG_FUNCTION_PADDING_BYTES) -objtool-args-$(CONFIG_OBJTOOL_WERROR) += --Werror +objtool-args-$(CONFIG_OBJTOOL_WERROR) += --werror objtool-args = $(objtool-args-y) \ $(if $(delay-objtool), --link) \ $(if $(part-of-module), --module) -delay-objtool := $(or $(CONFIG_LTO_CLANG),$(CONFIG_X86_KERNEL_IBT)) +delay-objtool := $(or $(CONFIG_LTO_CLANG),$(CONFIG_X86_KERNEL_IBT),$(CONFIG_KLP_BUILD)) cmd_objtool = $(if $(objtool-enabled), ; $(objtool) $(objtool-args) $@) cmd_gen_objtooldep = $(if $(objtool-enabled), { echo ; echo '$@: $$(wildcard $(objtool))' ; } >> $(dot-target).cmd) diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index ced4379550d7..cd788cac9d91 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -102,11 +102,24 @@ vmlinux: vmlinux.unstripped FORCE # modules.builtin.modinfo # --------------------------------------------------------------------------- +# .modinfo in vmlinux.unstripped is aligned to 8 bytes for compatibility with +# tools that expect vmlinux to have sufficiently aligned sections but the +# additional bytes used for padding .modinfo to satisfy this requirement break +# certain versions of kmod with +# +# depmod: ERROR: kmod_builtin_iter_next: unexpected string without modname prefix +# +# Strip the trailing padding bytes after extracting .modinfo to comply with +# what kmod expects to parse. +quiet_cmd_modules_builtin_modinfo = GEN $@ + cmd_modules_builtin_modinfo = $(cmd_objcopy); \ + sed -i 's/\x00\+$$/\x00/g' $@ + OBJCOPYFLAGS_modules.builtin.modinfo := -j .modinfo -O binary targets += modules.builtin.modinfo modules.builtin.modinfo: vmlinux.unstripped FORCE - $(call if_changed,objcopy) + $(call if_changed,modules_builtin_modinfo) # modules.builtin # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 23c8751285d7..527352c222ff 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -41,7 +41,7 @@ objtool-enabled := $(or $(delay-objtool),$(CONFIG_NOINSTR_VALIDATION)) ifeq ($(delay-objtool),y) vmlinux-objtool-args-y += $(objtool-args-y) else -vmlinux-objtool-args-$(CONFIG_OBJTOOL_WERROR) += --Werror +vmlinux-objtool-args-$(CONFIG_OBJTOOL_WERROR) += --werror endif vmlinux-objtool-args-$(CONFIG_NOINSTR_VALIDATION) += --noinstr \ @@ -63,11 +63,15 @@ quiet_cmd_ld_vmlinux.o = LD $@ --start-group $(KBUILD_VMLINUX_LIBS) --end-group \ $(cmd_objtool) +cmd_check_function_names = $(srctree)/scripts/check-function-names.sh $@ + define rule_ld_vmlinux.o $(call cmd_and_savecmd,ld_vmlinux.o) $(call cmd,gen_objtooldep) + $(call cmd,check_function_names) endef + vmlinux.o: $(initcalls-lds) vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE $(call if_changed_rule,ld_vmlinux.o) diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh index 592f3ec89b5f..9c1d53f81eb2 100755 --- a/scripts/atomic/gen-atomic-instrumented.sh +++ b/scripts/atomic/gen-atomic-instrumented.sh @@ -12,7 +12,7 @@ gen_param_check() local arg="$1"; shift local type="${arg%%:*}" local name="$(gen_param_name "${arg}")" - local rw="write" + local rw="atomic_write" case "${type#c}" in i) return;; @@ -20,14 +20,17 @@ gen_param_check() if [ ${type#c} != ${type} ]; then # We don't write to constant parameters. - rw="read" + rw="atomic_read" + elif [ "${type}" = "p" ] ; then + # The "old" argument in try_cmpxchg() gets accessed non-atomically + rw="read_write" elif [ "${meta}" != "s" ]; then # An atomic RMW: if this parameter is not a constant, and this atomic is # not just a 's'tore, this parameter is both read from and written to. - rw="read_write" + rw="atomic_read_write" fi - printf "\tinstrument_atomic_${rw}(${name}, sizeof(*${name}));\n" + printf "\tinstrument_${rw}(${name}, sizeof(*${name}));\n" } #gen_params_checks(meta, arg...) diff --git a/scripts/check-function-names.sh b/scripts/check-function-names.sh new file mode 100755 index 000000000000..410042591cfc --- /dev/null +++ b/scripts/check-function-names.sh @@ -0,0 +1,25 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Certain function names are disallowed due to section name ambiguities +# introduced by -ffunction-sections. +# +# See the comment above TEXT_MAIN in include/asm-generic/vmlinux.lds.h. + +objfile="$1" + +if [ ! -f "$objfile" ]; then + echo "usage: $0 <file.o>" >&2 + exit 1 +fi + +bad_symbols=$(nm "$objfile" | awk '$2 ~ /^[TtWw]$/ {print $3}' | grep -E '^(startup|exit|split|unlikely|hot|unknown)(\.|$)') + +if [ -n "$bad_symbols" ]; then + echo "$bad_symbols" | while read -r sym; do + echo "$objfile: error: $sym() function name creates ambiguity with -ffunction-sections" >&2 + done + exit 1 +fi + +exit 0 diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index c73cb802a0a3..8d01b741de62 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -277,12 +277,6 @@ handle_line() { fi done - if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then - words[$last-1]="${words[$last-1]} ${words[$last]}" - unset words[$last] spaces[$last] - last=$(( $last - 1 )) - fi - # Extract info after the symbol if present. E.g.: # func_name+0x54/0x80 (P) # ^^^ @@ -295,6 +289,14 @@ handle_line() { last=$(( $last - 1 )) fi + # Join module name with its build id if present, as these were + # split during tokenization (e.g. "[module" and "modbuildid]"). + if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then + words[$last-1]="${words[$last-1]} ${words[$last]}" + unset words[$last] spaces[$last] + last=$(( $last - 1 )) + fi + if [[ ${words[$last]} =~ \[([^]]+)\] ]]; then module=${words[$last]} # some traces format is "(%pS)", which like "(foo+0x0/0x1 [bar])" diff --git a/scripts/faddr2line b/scripts/faddr2line index 1fa6beef9f97..622875396bcf 100755 --- a/scripts/faddr2line +++ b/scripts/faddr2line @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # SPDX-License-Identifier: GPL-2.0 # # Translate stack dump function offsets. @@ -76,6 +76,10 @@ ADDR2LINE="${UTIL_PREFIX}addr2line${UTIL_SUFFIX}" AWK="awk" GREP="grep" +# Enforce ASCII-only output from tools like readelf +# ensuring sed processes strings correctly. +export LANG=C + command -v ${AWK} >/dev/null 2>&1 || die "${AWK} isn't installed" command -v ${READELF} >/dev/null 2>&1 || die "${READELF} isn't installed" command -v ${ADDR2LINE} >/dev/null 2>&1 || die "${ADDR2LINE} isn't installed" @@ -107,14 +111,19 @@ find_dir_prefix() { run_readelf() { local objfile=$1 - local out=$(${READELF} --file-header --section-headers --symbols --wide $objfile) + local tmpfile + tmpfile=$(mktemp) + + ${READELF} --file-header --section-headers --symbols --wide "$objfile" > "$tmpfile" # This assumes that readelf first prints the file header, then the section headers, then the symbols. # Note: It seems that GNU readelf does not prefix section headers with the "There are X section headers" # line when multiple options are given, so let's also match with the "Section Headers:" line. - ELF_FILEHEADER=$(echo "${out}" | sed -n '/There are [0-9]* section headers, starting at offset\|Section Headers:/q;p') - ELF_SECHEADERS=$(echo "${out}" | sed -n '/There are [0-9]* section headers, starting at offset\|Section Headers:/,$p' | sed -n '/Symbol table .* contains [0-9]* entries:/q;p') - ELF_SYMS=$(echo "${out}" | sed -n '/Symbol table .* contains [0-9]* entries:/,$p') + ELF_FILEHEADER=$(sed -n '/There are [0-9]* section headers, starting at offset\|Section Headers:/q;p' "$tmpfile") + ELF_SECHEADERS=$(sed -n '/There are [0-9]* section headers, starting at offset\|Section Headers:/,$p' "$tmpfile" | sed -n '/Symbol table .* contains [0-9]* entries:/q;p') + ELF_SYMS=$(sed -n '/Symbol table .* contains [0-9]* entries:/,$p' "$tmpfile") + + rm -f -- "$tmpfile" } check_vmlinux() { diff --git a/scripts/gendwarfksyms/gendwarfksyms.c b/scripts/gendwarfksyms/gendwarfksyms.c index 08ae61eb327e..f5203d1640ee 100644 --- a/scripts/gendwarfksyms/gendwarfksyms.c +++ b/scripts/gendwarfksyms/gendwarfksyms.c @@ -138,7 +138,8 @@ int main(int argc, char **argv) error("no input files?"); } - symbol_read_exports(stdin); + if (!symbol_read_exports(stdin)) + return 0; if (symtypes_file) { symfile = fopen(symtypes_file, "w"); diff --git a/scripts/gendwarfksyms/gendwarfksyms.h b/scripts/gendwarfksyms/gendwarfksyms.h index d9c06d2cb1df..32cec8f7695a 100644 --- a/scripts/gendwarfksyms/gendwarfksyms.h +++ b/scripts/gendwarfksyms/gendwarfksyms.h @@ -123,7 +123,7 @@ struct symbol { typedef void (*symbol_callback_t)(struct symbol *, void *arg); bool is_symbol_ptr(const char *name); -void symbol_read_exports(FILE *file); +int symbol_read_exports(FILE *file); void symbol_read_symtab(int fd); struct symbol *symbol_get(const char *name); void symbol_set_ptr(struct symbol *sym, Dwarf_Die *ptr); diff --git a/scripts/gendwarfksyms/symbols.c b/scripts/gendwarfksyms/symbols.c index 35ed594f0749..ecddcb5ffcdf 100644 --- a/scripts/gendwarfksyms/symbols.c +++ b/scripts/gendwarfksyms/symbols.c @@ -128,7 +128,7 @@ static bool is_exported(const char *name) return for_each(name, NULL, NULL) > 0; } -void symbol_read_exports(FILE *file) +int symbol_read_exports(FILE *file) { struct symbol *sym; char *line = NULL; @@ -159,6 +159,8 @@ void symbol_read_exports(FILE *file) free(line); debug("%d exported symbols", nsym); + + return nsym; } static void get_symbol(struct symbol *sym, void *arg) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 433849ff7529..2df714ba51a9 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -60,7 +60,8 @@ vmlinux_link() # skip output file argument shift - if is_enabled CONFIG_LTO_CLANG || is_enabled CONFIG_X86_KERNEL_IBT; then + if is_enabled CONFIG_LTO_CLANG || is_enabled CONFIG_X86_KERNEL_IBT || + is_enabled CONFIG_KLP_BUILD; then # Use vmlinux.o instead of performing the slow LTO link again. objs=vmlinux.o libs= diff --git a/scripts/livepatch/fix-patch-lines b/scripts/livepatch/fix-patch-lines new file mode 100755 index 000000000000..fa7d4f6592e6 --- /dev/null +++ b/scripts/livepatch/fix-patch-lines @@ -0,0 +1,79 @@ +#!/usr/bin/awk -f +# SPDX-License-Identifier: GPL-2.0 +# +# Use #line directives to preserve original __LINE__ numbers across patches to +# avoid unwanted compilation changes. + +BEGIN { + in_hunk = 0 + skip = 0 +} + +/^--- / { + skip = $2 !~ /\.(c|h)$/ + print + next +} + +/^@@/ { + if (skip) { + print + next + } + + in_hunk = 1 + + # @@ -1,3 +1,4 @@: + # 1: line number in old file + # 3: how many lines the hunk covers in old file + # 1: line number in new file + # 4: how many lines the hunk covers in new file + + match($0, /^@@ -([0-9]+)(,([0-9]+))? \+([0-9]+)(,([0-9]+))? @@/, m) + + # Set 'cur' to the old file's line number at the start of the hunk. It + # gets incremented for every context line and every line removal, so + # that it always represents the old file's current line number. + cur = m[1] + + # last = last line number of current hunk + last = cur + (m[3] ? m[3] : 1) - 1 + + need_line_directive = 0 + + print + next +} + +{ + if (skip || !in_hunk || $0 ~ /^\\ No newline at end of file/) { + print + next + } + + # change line + if ($0 ~ /^[+-]/) { + # inject #line after this group of changes + need_line_directive = 1 + + if ($0 ~ /^-/) + cur++ + + print + next + } + + # If this is the first context line after a group of changes, inject + # the #line directive to force the compiler to correct the line + # numbering to match the original file. + if (need_line_directive) { + print "+#line " cur + need_line_directive = 0 + } + + if (cur == last) + in_hunk = 0 + + cur++ + print +} diff --git a/scripts/livepatch/init.c b/scripts/livepatch/init.c new file mode 100644 index 000000000000..2274d8f5a482 --- /dev/null +++ b/scripts/livepatch/init.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Init code for a livepatch kernel module + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/livepatch.h> + +extern struct klp_object_ext __start_klp_objects[]; +extern struct klp_object_ext __stop_klp_objects[]; + +static struct klp_patch *patch; + +static int __init livepatch_mod_init(void) +{ + struct klp_object *objs; + unsigned int nr_objs; + int ret; + + nr_objs = __stop_klp_objects - __start_klp_objects; + + if (!nr_objs) { + pr_err("nothing to patch!\n"); + ret = -EINVAL; + goto err; + } + + patch = kzalloc(sizeof(*patch), GFP_KERNEL); + if (!patch) { + ret = -ENOMEM; + goto err; + } + + objs = kzalloc(sizeof(struct klp_object) * (nr_objs + 1), GFP_KERNEL); + if (!objs) { + ret = -ENOMEM; + goto err_free_patch; + } + + for (int i = 0; i < nr_objs; i++) { + struct klp_object_ext *obj_ext = __start_klp_objects + i; + struct klp_func_ext *funcs_ext = obj_ext->funcs; + unsigned int nr_funcs = obj_ext->nr_funcs; + struct klp_func *funcs = objs[i].funcs; + struct klp_object *obj = objs + i; + + funcs = kzalloc(sizeof(struct klp_func) * (nr_funcs + 1), GFP_KERNEL); + if (!funcs) { + ret = -ENOMEM; + for (int j = 0; j < i; j++) + kfree(objs[i].funcs); + goto err_free_objs; + } + + for (int j = 0; j < nr_funcs; j++) { + funcs[j].old_name = funcs_ext[j].old_name; + funcs[j].new_func = funcs_ext[j].new_func; + funcs[j].old_sympos = funcs_ext[j].sympos; + } + + obj->name = obj_ext->name; + obj->funcs = funcs; + + memcpy(&obj->callbacks, &obj_ext->callbacks, sizeof(struct klp_callbacks)); + } + + patch->mod = THIS_MODULE; + patch->objs = objs; + + /* TODO patch->states */ + +#ifdef KLP_NO_REPLACE + patch->replace = false; +#else + patch->replace = true; +#endif + + return klp_enable_patch(patch); + +err_free_objs: + kfree(objs); +err_free_patch: + kfree(patch); +err: + return ret; +} + +static void __exit livepatch_mod_exit(void) +{ + unsigned int nr_objs; + + nr_objs = __stop_klp_objects - __start_klp_objects; + + for (int i = 0; i < nr_objs; i++) + kfree(patch->objs[i].funcs); + + kfree(patch->objs); + kfree(patch); +} + +module_init(livepatch_mod_init); +module_exit(livepatch_mod_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); +MODULE_DESCRIPTION("Livepatch module"); diff --git a/scripts/livepatch/klp-build b/scripts/livepatch/klp-build new file mode 100755 index 000000000000..882272120c9e --- /dev/null +++ b/scripts/livepatch/klp-build @@ -0,0 +1,831 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Build a livepatch module + +# shellcheck disable=SC1090,SC2155 + +if (( BASH_VERSINFO[0] < 4 || \ + (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] < 4) )); then + echo "error: this script requires bash 4.4+" >&2 + exit 1 +fi + +set -o errexit +set -o errtrace +set -o pipefail +set -o nounset + +# Allow doing 'cmd | mapfile -t array' instead of 'mapfile -t array < <(cmd)'. +# This helps keep execution in pipes so pipefail+errexit can catch errors. +shopt -s lastpipe + +unset DEBUG_CLONE DIFF_CHECKSUM SKIP_CLEANUP XTRACE + +REPLACE=1 +SHORT_CIRCUIT=0 +JOBS="$(getconf _NPROCESSORS_ONLN)" +VERBOSE="-s" +shopt -o xtrace | grep -q 'on' && XTRACE=1 + +# Avoid removing the previous $TMP_DIR until args have been fully processed. +KEEP_TMP=1 + +SCRIPT="$(basename "$0")" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +FIX_PATCH_LINES="$SCRIPT_DIR/fix-patch-lines" + +SRC="$(pwd)" +OBJ="$(pwd)" + +CONFIG="$OBJ/.config" +TMP_DIR="$OBJ/klp-tmp" + +ORIG_DIR="$TMP_DIR/orig" +PATCHED_DIR="$TMP_DIR/patched" +DIFF_DIR="$TMP_DIR/diff" +KMOD_DIR="$TMP_DIR/kmod" + +STASH_DIR="$TMP_DIR/stash" +TIMESTAMP="$TMP_DIR/timestamp" +PATCH_TMP_DIR="$TMP_DIR/tmp" + +KLP_DIFF_LOG="$DIFF_DIR/diff.log" + +grep0() { + command grep "$@" || true +} + +status() { + echo "$*" +} + +warn() { + echo "error: $SCRIPT: $*" >&2 +} + +die() { + warn "$@" + exit 1 +} + +declare -a STASHED_FILES + +stash_file() { + local file="$1" + local rel_file="${file#"$SRC"/}" + + [[ ! -e "$file" ]] && die "no file to stash: $file" + + mkdir -p "$STASH_DIR/$(dirname "$rel_file")" + cp -f "$file" "$STASH_DIR/$rel_file" + + STASHED_FILES+=("$rel_file") +} + +restore_files() { + local file + + for file in "${STASHED_FILES[@]}"; do + mv -f "$STASH_DIR/$file" "$SRC/$file" || warn "can't restore file: $file" + done + + STASHED_FILES=() +} + +cleanup() { + set +o nounset + revert_patches "--recount" + restore_files + [[ "$KEEP_TMP" -eq 0 ]] && rm -rf "$TMP_DIR" + return 0 +} + +trap_err() { + warn "line ${BASH_LINENO[0]}: '$BASH_COMMAND'" +} + +trap cleanup EXIT INT TERM HUP +trap trap_err ERR + +__usage() { + cat <<EOF +Usage: $SCRIPT [OPTIONS] PATCH_FILE(s) +Generate a livepatch module. + +Options: + -f, --show-first-changed Show address of first changed instruction + -j, --jobs=<jobs> Build jobs to run simultaneously [default: $JOBS] + -o, --output=<file.ko> Output file [default: livepatch-<patch-name>.ko] + --no-replace Disable livepatch atomic replace + -v, --verbose Pass V=1 to kernel/module builds + +Advanced Options: + -d, --debug Show symbol/reloc cloning decisions + -S, --short-circuit=STEP Start at build step (requires prior --keep-tmp) + 1|orig Build original kernel (default) + 2|patched Build patched kernel + 3|diff Diff objects + 4|kmod Build patch module + -T, --keep-tmp Preserve tmp dir on exit + +EOF +} + +usage() { + __usage >&2 +} + +process_args() { + local keep_tmp=0 + local short + local long + local args + + short="hfj:o:vdS:T" + long="help,show-first-changed,jobs:,output:,no-replace,verbose,debug,short-circuit:,keep-tmp" + + args=$(getopt --options "$short" --longoptions "$long" -- "$@") || { + echo; usage; exit + } + eval set -- "$args" + + while true; do + case "$1" in + -h | --help) + usage + exit 0 + ;; + -f | --show-first-changed) + DIFF_CHECKSUM=1 + shift + ;; + -j | --jobs) + JOBS="$2" + shift 2 + ;; + -o | --output) + [[ "$2" != *.ko ]] && die "output filename should end with .ko" + OUTFILE="$2" + NAME="$(basename "$OUTFILE")" + NAME="${NAME%.ko}" + NAME="$(module_name_string "$NAME")" + shift 2 + ;; + --no-replace) + REPLACE=0 + shift + ;; + -v | --verbose) + VERBOSE="V=1" + shift + ;; + -d | --debug) + DEBUG_CLONE=1 + keep_tmp=1 + shift + ;; + -S | --short-circuit) + [[ ! -d "$TMP_DIR" ]] && die "--short-circuit requires preserved klp-tmp dir" + keep_tmp=1 + case "$2" in + 1 | orig) SHORT_CIRCUIT=1; ;; + 2 | patched) SHORT_CIRCUIT=2; ;; + 3 | diff) SHORT_CIRCUIT=3; ;; + 4 | mod) SHORT_CIRCUIT=4; ;; + *) die "invalid short-circuit step '$2'" ;; + esac + shift 2 + ;; + -T | --keep-tmp) + keep_tmp=1 + shift + ;; + --) + shift + break + ;; + *) + usage + exit 1 + ;; + esac + done + + if [[ $# -eq 0 ]]; then + usage + exit 1 + fi + + KEEP_TMP="$keep_tmp" + PATCHES=("$@") +} + +# temporarily disable xtrace for especially verbose code +xtrace_save() { + [[ -v XTRACE ]] && set +x + return 0 +} + +xtrace_restore() { + [[ -v XTRACE ]] && set -x + return 0 +} + +validate_config() { + xtrace_save "reading .config" + source "$CONFIG" || die "no .config file in $(dirname "$CONFIG")" + xtrace_restore + + [[ -v CONFIG_LIVEPATCH ]] || \ + die "CONFIG_LIVEPATCH not enabled" + + [[ -v CONFIG_KLP_BUILD ]] || \ + die "CONFIG_KLP_BUILD not enabled" + + [[ -v CONFIG_GCC_PLUGIN_LATENT_ENTROPY ]] && \ + die "kernel option 'CONFIG_GCC_PLUGIN_LATENT_ENTROPY' not supported" + + [[ -v CONFIG_GCC_PLUGIN_RANDSTRUCT ]] && \ + die "kernel option 'CONFIG_GCC_PLUGIN_RANDSTRUCT' not supported" + + return 0 +} + +# Only allow alphanumerics and '_' and '-' in the module name. Everything else +# is replaced with '-'. Also truncate to 55 chars so the full name + NUL +# terminator fits in the kernel's 56-byte module name array. +module_name_string() { + echo "${1//[^a-zA-Z0-9_-]/-}" | cut -c 1-55 +} + +# If the module name wasn't specified on the cmdline with --output, give it a +# name based on the patch name. +set_module_name() { + [[ -v NAME ]] && return 0 + + if [[ "${#PATCHES[@]}" -eq 1 ]]; then + NAME="$(basename "${PATCHES[0]}")" + NAME="${NAME%.*}" + else + NAME="patch" + fi + + NAME="livepatch-$NAME" + NAME="$(module_name_string "$NAME")" + + OUTFILE="$NAME.ko" +} + +# Hardcode the value printed by the localversion script to prevent patch +# application from appending it with '+' due to a dirty git working tree. +set_kernelversion() { + local file="$SRC/scripts/setlocalversion" + local localversion + + stash_file "$file" + + localversion="$(cd "$SRC" && make --no-print-directory kernelversion)" + localversion="$(cd "$SRC" && KERNELVERSION="$localversion" ./scripts/setlocalversion)" + [[ -z "$localversion" ]] && die "setlocalversion failed" + + sed -i "2i echo $localversion; exit 0" scripts/setlocalversion +} + +get_patch_files() { + local patch="$1" + + grep0 -E '^(--- |\+\+\+ )' "$patch" \ + | gawk '{print $2}' \ + | sed 's|^[^/]*/||' \ + | sort -u +} + +# Make sure git re-stats the changed files +git_refresh() { + local patch="$1" + local files=() + + [[ ! -e "$SRC/.git" ]] && return + + get_patch_files "$patch" | mapfile -t files + + ( + cd "$SRC" + git update-index -q --refresh -- "${files[@]}" + ) +} + +check_unsupported_patches() { + local patch + + for patch in "${PATCHES[@]}"; do + local files=() + + get_patch_files "$patch" | mapfile -t files + + for file in "${files[@]}"; do + case "$file" in + lib/*|*.S) + die "unsupported patch to $file" + ;; + esac + done + done +} + +apply_patch() { + local patch="$1" + shift + local extra_args=("$@") + + [[ ! -f "$patch" ]] && die "$patch doesn't exist" + + ( + cd "$SRC" + + # The sed strips the version signature from 'git format-patch', + # otherwise 'git apply --recount' warns. + sed -n '/^-- /q;p' "$patch" | + git apply "${extra_args[@]}" + ) + + APPLIED_PATCHES+=("$patch") +} + +revert_patch() { + local patch="$1" + shift + local extra_args=("$@") + local tmp=() + + ( + cd "$SRC" + + sed -n '/^-- /q;p' "$patch" | + git apply --reverse "${extra_args[@]}" + ) + git_refresh "$patch" + + for p in "${APPLIED_PATCHES[@]}"; do + [[ "$p" == "$patch" ]] && continue + tmp+=("$p") + done + + APPLIED_PATCHES=("${tmp[@]}") +} + +apply_patches() { + local patch + + for patch in "${PATCHES[@]}"; do + apply_patch "$patch" + done +} + +revert_patches() { + local extra_args=("$@") + local patches=("${APPLIED_PATCHES[@]}") + + for (( i=${#patches[@]}-1 ; i>=0 ; i-- )) ; do + revert_patch "${patches[$i]}" "${extra_args[@]}" + done + + APPLIED_PATCHES=() +} + +validate_patches() { + check_unsupported_patches + apply_patches + revert_patches +} + +do_init() { + # We're not yet smart enough to handle anything other than in-tree + # builds in pwd. + [[ ! "$SRC" -ef "$SCRIPT_DIR/../.." ]] && die "please run from the kernel root directory" + [[ ! "$OBJ" -ef "$SCRIPT_DIR/../.." ]] && die "please run from the kernel root directory" + + (( SHORT_CIRCUIT <= 1 )) && rm -rf "$TMP_DIR" + mkdir -p "$TMP_DIR" + + APPLIED_PATCHES=() + + [[ -x "$FIX_PATCH_LINES" ]] || die "can't find fix-patch-lines" + + validate_config + set_module_name + set_kernelversion +} + +# Refresh the patch hunk headers, specifically the line numbers and counts. +refresh_patch() { + local patch="$1" + local tmpdir="$PATCH_TMP_DIR" + local files=() + + rm -rf "$tmpdir" + mkdir -p "$tmpdir/a" + mkdir -p "$tmpdir/b" + + # Get all source files affected by the patch + get_patch_files "$patch" | mapfile -t files + + # Copy orig source files to 'a' + ( cd "$SRC" && echo "${files[@]}" | xargs cp --parents --target-directory="$tmpdir/a" ) + + # Copy patched source files to 'b' + apply_patch "$patch" --recount + ( cd "$SRC" && echo "${files[@]}" | xargs cp --parents --target-directory="$tmpdir/b" ) + revert_patch "$patch" --recount + + # Diff 'a' and 'b' to make a clean patch + ( cd "$tmpdir" && git diff --no-index --no-prefix a b > "$patch" ) || true +} + +# Copy the patches to a temporary directory, fix their lines so as not to +# affect the __LINE__ macro for otherwise unchanged functions further down the +# file, and update $PATCHES to point to the fixed patches. +fix_patches() { + local idx + local i + + rm -f "$TMP_DIR"/*.patch + + idx=0001 + for i in "${!PATCHES[@]}"; do + local old_patch="${PATCHES[$i]}" + local tmp_patch="$TMP_DIR/tmp.patch" + local patch="${PATCHES[$i]}" + local new_patch + + new_patch="$TMP_DIR/$idx-fixed-$(basename "$patch")" + + cp -f "$old_patch" "$tmp_patch" + refresh_patch "$tmp_patch" + "$FIX_PATCH_LINES" "$tmp_patch" > "$new_patch" + refresh_patch "$new_patch" + + PATCHES[i]="$new_patch" + + rm -f "$tmp_patch" + idx=$(printf "%04d" $(( 10#$idx + 1 ))) + done +} + +clean_kernel() { + local cmd=() + + cmd=("make") + cmd+=("--silent") + cmd+=("-j$JOBS") + cmd+=("clean") + + ( + cd "$SRC" + "${cmd[@]}" + ) +} + +build_kernel() { + local log="$TMP_DIR/build.log" + local objtool_args=() + local cmd=() + + objtool_args=("--checksum") + + cmd=("make") + + # When a patch to a kernel module references a newly created unexported + # symbol which lives in vmlinux or another kernel module, the patched + # kernel build fails with the following error: + # + # ERROR: modpost: "klp_string" [fs/xfs/xfs.ko] undefined! + # + # The undefined symbols are working as designed in that case. They get + # resolved later when the livepatch module build link pulls all the + # disparate objects together into the same kernel module. + # + # It would be good to have a way to tell modpost to skip checking for + # undefined symbols altogether. For now, just convert the error to a + # warning with KBUILD_MODPOST_WARN, and grep out the warning to avoid + # confusing the user. + # + cmd+=("KBUILD_MODPOST_WARN=1") + + cmd+=("$VERBOSE") + cmd+=("-j$JOBS") + cmd+=("KCFLAGS=-ffunction-sections -fdata-sections") + cmd+=("OBJTOOL_ARGS=${objtool_args[*]}") + cmd+=("vmlinux") + cmd+=("modules") + + ( + cd "$SRC" + "${cmd[@]}" \ + 1> >(tee -a "$log") \ + 2> >(tee -a "$log" | grep0 -v "modpost.*undefined!" >&2) + ) +} + +find_objects() { + local opts=("$@") + + # Find root-level vmlinux.o and non-root-level .ko files, + # excluding klp-tmp/ and .git/ + find "$OBJ" \( -path "$TMP_DIR" -o -path "$OBJ/.git" -o -regex "$OBJ/[^/][^/]*\.ko" \) -prune -o \ + -type f "${opts[@]}" \ + \( -name "*.ko" -o -path "$OBJ/vmlinux.o" \) \ + -printf '%P\n' +} + +# Copy all .o archives to $ORIG_DIR +copy_orig_objects() { + local files=() + + rm -rf "$ORIG_DIR" + mkdir -p "$ORIG_DIR" + + find_objects | mapfile -t files + + xtrace_save "copying orig objects" + for _file in "${files[@]}"; do + local rel_file="${_file/.ko/.o}" + local file="$OBJ/$rel_file" + local file_dir="$(dirname "$file")" + local orig_file="$ORIG_DIR/$rel_file" + local orig_dir="$(dirname "$orig_file")" + local cmd_file="$file_dir/.$(basename "$file").cmd" + + [[ ! -f "$file" ]] && die "missing $(basename "$file") for $_file" + + mkdir -p "$orig_dir" + cp -f "$file" "$orig_dir" + [[ -e "$cmd_file" ]] && cp -f "$cmd_file" "$orig_dir" + done + xtrace_restore + + mv -f "$TMP_DIR/build.log" "$ORIG_DIR" + touch "$TIMESTAMP" +} + +# Copy all changed objects to $PATCHED_DIR +copy_patched_objects() { + local files=() + local opts=() + local found=0 + + rm -rf "$PATCHED_DIR" + mkdir -p "$PATCHED_DIR" + + # Note this doesn't work with some configs, thus the 'cmp' below. + opts=("-newer") + opts+=("$TIMESTAMP") + + find_objects "${opts[@]}" | mapfile -t files + + xtrace_save "copying changed objects" + for _file in "${files[@]}"; do + local rel_file="${_file/.ko/.o}" + local file="$OBJ/$rel_file" + local orig_file="$ORIG_DIR/$rel_file" + local patched_file="$PATCHED_DIR/$rel_file" + local patched_dir="$(dirname "$patched_file")" + + [[ ! -f "$file" ]] && die "missing $(basename "$file") for $_file" + + cmp -s "$orig_file" "$file" && continue + + mkdir -p "$patched_dir" + cp -f "$file" "$patched_dir" + found=1 + done + xtrace_restore + + (( found == 0 )) && die "no changes detected" + + mv -f "$TMP_DIR/build.log" "$PATCHED_DIR" +} + +# Diff changed objects, writing output object to $DIFF_DIR +diff_objects() { + local log="$KLP_DIFF_LOG" + local files=() + local opts=() + + rm -rf "$DIFF_DIR" + mkdir -p "$DIFF_DIR" + + find "$PATCHED_DIR" -type f -name "*.o" | mapfile -t files + [[ ${#files[@]} -eq 0 ]] && die "no changes detected" + + [[ -v DEBUG_CLONE ]] && opts=("--debug") + + # Diff all changed objects + for file in "${files[@]}"; do + local rel_file="${file#"$PATCHED_DIR"/}" + local orig_file="$rel_file" + local patched_file="$PATCHED_DIR/$rel_file" + local out_file="$DIFF_DIR/$rel_file" + local filter=() + local cmd=() + + mkdir -p "$(dirname "$out_file")" + + cmd=("$SRC/tools/objtool/objtool") + cmd+=("klp") + cmd+=("diff") + (( ${#opts[@]} > 0 )) && cmd+=("${opts[@]}") + cmd+=("$orig_file") + cmd+=("$patched_file") + cmd+=("$out_file") + + if [[ -v DIFF_CHECKSUM ]]; then + filter=("grep0") + filter+=("-Ev") + filter+=("DEBUG: .*checksum: ") + else + filter=("cat") + fi + + ( + cd "$ORIG_DIR" + "${cmd[@]}" \ + 1> >(tee -a "$log") \ + 2> >(tee -a "$log" | "${filter[@]}" >&2) || \ + die "objtool klp diff failed" + ) + done +} + +# For each changed object, run objtool with --debug-checksum to get the +# per-instruction checksums, and then diff those to find the first changed +# instruction for each function. +diff_checksums() { + local orig_log="$ORIG_DIR/checksum.log" + local patched_log="$PATCHED_DIR/checksum.log" + local -A funcs + local cmd=() + local line + local file + local func + + gawk '/\.o: changed function: / { + sub(/:$/, "", $1) + print $1, $NF + }' "$KLP_DIFF_LOG" | mapfile -t lines + + for line in "${lines[@]}"; do + read -r file func <<< "$line" + if [[ ! -v funcs["$file"] ]]; then + funcs["$file"]="$func" + else + funcs["$file"]+=" $func" + fi + done + + cmd=("$SRC/tools/objtool/objtool") + cmd+=("--checksum") + cmd+=("--link") + cmd+=("--dry-run") + + for file in "${!funcs[@]}"; do + local opt="--debug-checksum=${funcs[$file]// /,}" + + ( + cd "$ORIG_DIR" + "${cmd[@]}" "$opt" "$file" &> "$orig_log" || \ + ( cat "$orig_log" >&2; die "objtool --debug-checksum failed" ) + + cd "$PATCHED_DIR" + "${cmd[@]}" "$opt" "$file" &> "$patched_log" || \ + ( cat "$patched_log" >&2; die "objtool --debug-checksum failed" ) + ) + + for func in ${funcs[$file]}; do + diff <( grep0 -E "^DEBUG: .*checksum: $func " "$orig_log" | sed "s|$ORIG_DIR/||") \ + <( grep0 -E "^DEBUG: .*checksum: $func " "$patched_log" | sed "s|$PATCHED_DIR/||") \ + | gawk '/^< DEBUG: / { + gsub(/:/, "") + printf "%s: %s: %s\n", $3, $5, $6 + exit + }' || true + done + done +} + +# Build and post-process livepatch module in $KMOD_DIR +build_patch_module() { + local makefile="$KMOD_DIR/Kbuild" + local log="$KMOD_DIR/build.log" + local kmod_file + local cflags=() + local files=() + local cmd=() + + rm -rf "$KMOD_DIR" + mkdir -p "$KMOD_DIR" + + cp -f "$SRC/scripts/livepatch/init.c" "$KMOD_DIR" + + echo "obj-m := $NAME.o" > "$makefile" + echo -n "$NAME-y := init.o" >> "$makefile" + + find "$DIFF_DIR" -type f -name "*.o" | mapfile -t files + [[ ${#files[@]} -eq 0 ]] && die "no changes detected" + + for file in "${files[@]}"; do + local rel_file="${file#"$DIFF_DIR"/}" + local orig_file="$ORIG_DIR/$rel_file" + local orig_dir="$(dirname "$orig_file")" + local kmod_file="$KMOD_DIR/$rel_file" + local kmod_dir="$(dirname "$kmod_file")" + local cmd_file="$orig_dir/.$(basename "$file").cmd" + + mkdir -p "$kmod_dir" + cp -f "$file" "$kmod_dir" + [[ -e "$cmd_file" ]] && cp -f "$cmd_file" "$kmod_dir" + + # Tell kbuild this is a prebuilt object + cp -f "$file" "${kmod_file}_shipped" + + echo -n " $rel_file" >> "$makefile" + done + + echo >> "$makefile" + + cflags=("-ffunction-sections") + cflags+=("-fdata-sections") + [[ $REPLACE -eq 0 ]] && cflags+=("-DKLP_NO_REPLACE") + + cmd=("make") + cmd+=("$VERBOSE") + cmd+=("-j$JOBS") + cmd+=("--directory=.") + cmd+=("M=$KMOD_DIR") + cmd+=("KCFLAGS=${cflags[*]}") + + # Build a "normal" kernel module with init.c and the diffed objects + ( + cd "$SRC" + "${cmd[@]}" \ + 1> >(tee -a "$log") \ + 2> >(tee -a "$log" >&2) + ) + + kmod_file="$KMOD_DIR/$NAME.ko" + + # Save off the intermediate binary for debugging + cp -f "$kmod_file" "$kmod_file.orig" + + # Work around issue where slight .config change makes corrupt BTF + objcopy --remove-section=.BTF "$kmod_file" + + # Fix (and work around) linker wreckage for klp syms / relocs + "$SRC/tools/objtool/objtool" klp post-link "$kmod_file" || die "objtool klp post-link failed" + + cp -f "$kmod_file" "$OUTFILE" +} + + +################################################################################ + +process_args "$@" +do_init + +if (( SHORT_CIRCUIT <= 1 )); then + status "Validating patch(es)" + validate_patches + status "Building original kernel" + clean_kernel + build_kernel + status "Copying original object files" + copy_orig_objects +fi + +if (( SHORT_CIRCUIT <= 2 )); then + status "Fixing patch(es)" + fix_patches + apply_patches + status "Building patched kernel" + build_kernel + revert_patches + status "Copying patched object files" + copy_patched_objects +fi + +if (( SHORT_CIRCUIT <= 3 )); then + status "Diffing objects" + diff_objects + if [[ -v DIFF_CHECKSUM ]]; then + status "Finding first changed instructions" + diff_checksums + fi +fi + +if (( SHORT_CIRCUIT <= 4 )); then + status "Building patch module: $OUTFILE" + build_patch_module +fi + +status "SUCCESS" diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 47c8aa2a6939..755b842f1f9b 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -606,6 +606,11 @@ static int ignore_undef_symbol(struct elf_info *info, const char *symname) strstarts(symname, "_savevr_") || strcmp(symname, ".TOC.") == 0) return 1; + + /* ignore linker-created section bounds variables */ + if (strstarts(symname, "__start_") || strstarts(symname, "__stop_")) + return 1; + /* Do not ignore this symbol */ return 0; } diff --git a/scripts/module.lds.S b/scripts/module.lds.S index ee79c41059f3..3037d5e5527c 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -34,16 +34,22 @@ SECTIONS { __patchable_function_entries : { *(__patchable_function_entries) } + __klp_funcs 0: ALIGN(8) { KEEP(*(__klp_funcs)) } + + __klp_objects 0: ALIGN(8) { + __start_klp_objects = .; + KEEP(*(__klp_objects)) + __stop_klp_objects = .; + } + #ifdef CONFIG_ARCH_USES_CFI_TRAPS - __kcfi_traps : { KEEP(*(.kcfi_traps)) } + __kcfi_traps : { KEEP(*(.kcfi_traps)) } #endif -#ifdef CONFIG_LTO_CLANG - /* - * With CONFIG_LTO_CLANG, LLD always enables -fdata-sections and - * -ffunction-sections, which increases the size of the final module. - * Merge the split sections in the final binary. - */ + .text : { + *(.text .text.[0-9a-zA-Z_]*) + } + .bss : { *(.bss .bss.[0-9a-zA-Z_]*) *(.bss..L*) @@ -58,7 +64,7 @@ SECTIONS { *(.rodata .rodata.[0-9a-zA-Z_]*) *(.rodata..L*) } -#endif + MOD_SEPARATE_CODETAG_SECTIONS() } diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl index d1ae5e92c615..e74868be513c 100644 --- a/scripts/syscall.tbl +++ b/scripts/syscall.tbl @@ -410,3 +410,4 @@ 467 common open_tree_attr sys_open_tree_attr 468 common file_getattr sys_file_getattr 469 common file_setattr sys_file_setattr +470 common listns sys_listns diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 391a586d0557..9d08d103f142 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -355,17 +355,17 @@ static void aafs_remove(struct dentry *dentry) if (!dentry || IS_ERR(dentry)) return; + /* ->d_parent is stable as rename is not supported */ dir = d_inode(dentry->d_parent); - inode_lock(dir); - if (simple_positive(dentry)) { + dentry = start_removing_dentry(dentry->d_parent, dentry); + if (!IS_ERR(dentry) && simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(dir, dentry); else simple_unlink(dir, dentry); d_delete(dentry); - dput(dentry); } - inode_unlock(dir); + end_removing(dentry); simple_release_fs(&aafs_mnt, &aafs_count); } diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index b5d5333ab330..a63c46bb2d14 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -51,7 +51,7 @@ static struct key *get_user_register(struct user_namespace *user_ns) if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, - &init_cred, + kernel_cred(), KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 0bade2c5aa1d..cee2b6f22c83 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1296,7 +1296,7 @@ static void hook_sb_delete(struct super_block *const sb) * second call to iput() for the same Landlock object. Also * checks I_NEW because such inode cannot be tied to an object. */ - if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } @@ -1335,11 +1335,10 @@ static void hook_sb_delete(struct super_block *const sb) * At this point, we own the ihold() reference that was * originally set up by get_inode_object() and the * __iget() reference that we just set in this loop - * walk. Therefore the following call to iput() will - * not sleep nor drop the inode because there is now at - * least two references to it. + * walk. Therefore there are at least two references + * on the inode. */ - iput(inode); + iput_not_last(inode); } else { spin_unlock(&object->lock); rcu_read_unlock(); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index dfc22da42f30..e713291db873 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -210,12 +210,12 @@ static int selinux_lsm_notifier_avc_callback(u32 event) */ static void cred_init_security(void) { - struct task_security_struct *tsec; + struct cred_security_struct *crsec; /* NOTE: the lsm framework zeros out the buffer on allocation */ - tsec = selinux_cred(unrcu_pointer(current->real_cred)); - tsec->osid = tsec->sid = tsec->avdcache.sid = SECINITSID_KERNEL; + crsec = selinux_cred(unrcu_pointer(current->real_cred)); + crsec->osid = crsec->sid = SECINITSID_KERNEL; } /* @@ -223,10 +223,10 @@ static void cred_init_security(void) */ static inline u32 cred_sid(const struct cred *cred) { - const struct task_security_struct *tsec; + const struct cred_security_struct *crsec; - tsec = selinux_cred(cred); - return tsec->sid; + crsec = selinux_cred(cred); + return crsec->sid; } static void __ad_net_init(struct common_audit_data *ad, @@ -437,15 +437,15 @@ static int may_context_mount_sb_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { - const struct task_security_struct *tsec = selinux_cred(cred); + const struct cred_security_struct *crsec = selinux_cred(cred); int rc; - rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, + rc = avc_has_perm(crsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; - rc = avc_has_perm(tsec->sid, sid, SECCLASS_FILESYSTEM, + rc = avc_has_perm(crsec->sid, sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELTO, NULL); return rc; } @@ -454,9 +454,9 @@ static int may_context_mount_inode_relabel(u32 sid, struct superblock_security_struct *sbsec, const struct cred *cred) { - const struct task_security_struct *tsec = selinux_cred(cred); + const struct cred_security_struct *crsec = selinux_cred(cred); int rc; - rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, + rc = avc_has_perm(crsec->sid, sbsec->sid, SECCLASS_FILESYSTEM, FILESYSTEM__RELABELFROM, NULL); if (rc) return rc; @@ -1788,7 +1788,7 @@ out: * Determine the label for an inode that might be unioned. */ static int -selinux_determine_inode_label(const struct task_security_struct *tsec, +selinux_determine_inode_label(const struct cred_security_struct *crsec, struct inode *dir, const struct qstr *name, u16 tclass, u32 *_new_isid) @@ -1800,11 +1800,11 @@ selinux_determine_inode_label(const struct task_security_struct *tsec, (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) { *_new_isid = sbsec->mntpoint_sid; } else if ((sbsec->flags & SBLABEL_MNT) && - tsec->create_sid) { - *_new_isid = tsec->create_sid; + crsec->create_sid) { + *_new_isid = crsec->create_sid; } else { const struct inode_security_struct *dsec = inode_security(dir); - return security_transition_sid(tsec->sid, + return security_transition_sid(crsec->sid, dsec->sid, tclass, name, _new_isid); } @@ -1817,7 +1817,7 @@ static int may_create(struct inode *dir, struct dentry *dentry, u16 tclass) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); struct inode_security_struct *dsec; struct superblock_security_struct *sbsec; u32 sid, newsid; @@ -1827,7 +1827,7 @@ static int may_create(struct inode *dir, dsec = inode_security(dir); sbsec = selinux_superblock(dir->i_sb); - sid = tsec->sid; + sid = crsec->sid; ad.type = LSM_AUDIT_DATA_DENTRY; ad.u.dentry = dentry; @@ -1838,7 +1838,7 @@ static int may_create(struct inode *dir, if (rc) return rc; - rc = selinux_determine_inode_label(tsec, dir, &dentry->d_name, tclass, + rc = selinux_determine_inode_label(crsec, dir, &dentry->d_name, tclass, &newsid); if (rc) return rc; @@ -2251,8 +2251,8 @@ static u32 ptrace_parent_sid(void) } static int check_nnp_nosuid(const struct linux_binprm *bprm, - const struct task_security_struct *old_tsec, - const struct task_security_struct *new_tsec) + const struct cred_security_struct *old_crsec, + const struct cred_security_struct *new_crsec) { int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS); int nosuid = !mnt_may_suid(bprm->file->f_path.mnt); @@ -2262,7 +2262,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, if (!nnp && !nosuid) return 0; /* neither NNP nor nosuid */ - if (new_tsec->sid == old_tsec->sid) + if (new_crsec->sid == old_crsec->sid) return 0; /* No change in credentials */ /* @@ -2277,7 +2277,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, av |= PROCESS2__NNP_TRANSITION; if (nosuid) av |= PROCESS2__NOSUID_TRANSITION; - rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS2, av, NULL); if (!rc) return 0; @@ -2288,8 +2288,8 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, * i.e. SIDs that are guaranteed to only be allowed a subset * of the permissions of the current SID. */ - rc = security_bounded_transition(old_tsec->sid, - new_tsec->sid); + rc = security_bounded_transition(old_crsec->sid, + new_crsec->sid); if (!rc) return 0; @@ -2305,8 +2305,8 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) { - const struct task_security_struct *old_tsec; - struct task_security_struct *new_tsec; + const struct cred_security_struct *old_crsec; + struct cred_security_struct *new_crsec; struct inode_security_struct *isec; struct common_audit_data ad; struct inode *inode = file_inode(bprm->file); @@ -2315,18 +2315,18 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) /* SELinux context only depends on initial program or script and not * the script interpreter */ - old_tsec = selinux_cred(current_cred()); - new_tsec = selinux_cred(bprm->cred); + old_crsec = selinux_cred(current_cred()); + new_crsec = selinux_cred(bprm->cred); isec = inode_security(inode); /* Default to the current task SID. */ - new_tsec->sid = old_tsec->sid; - new_tsec->osid = old_tsec->sid; + new_crsec->sid = old_crsec->sid; + new_crsec->osid = old_crsec->sid; /* Reset fs, key, and sock SIDs on execve. */ - new_tsec->create_sid = 0; - new_tsec->keycreate_sid = 0; - new_tsec->sockcreate_sid = 0; + new_crsec->create_sid = 0; + new_crsec->keycreate_sid = 0; + new_crsec->sockcreate_sid = 0; /* * Before policy is loaded, label any task outside kernel space @@ -2335,26 +2335,26 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) * (if the policy chooses to set SECINITSID_INIT != SECINITSID_KERNEL). */ if (!selinux_initialized()) { - new_tsec->sid = SECINITSID_INIT; + new_crsec->sid = SECINITSID_INIT; /* also clear the exec_sid just in case */ - new_tsec->exec_sid = 0; + new_crsec->exec_sid = 0; return 0; } - if (old_tsec->exec_sid) { - new_tsec->sid = old_tsec->exec_sid; + if (old_crsec->exec_sid) { + new_crsec->sid = old_crsec->exec_sid; /* Reset exec SID on execve. */ - new_tsec->exec_sid = 0; + new_crsec->exec_sid = 0; /* Fail on NNP or nosuid if not an allowed transition. */ - rc = check_nnp_nosuid(bprm, old_tsec, new_tsec); + rc = check_nnp_nosuid(bprm, old_crsec, new_crsec); if (rc) return rc; } else { /* Check for a default transition on this program. */ - rc = security_transition_sid(old_tsec->sid, + rc = security_transition_sid(old_crsec->sid, isec->sid, SECCLASS_PROCESS, NULL, - &new_tsec->sid); + &new_crsec->sid); if (rc) return rc; @@ -2362,34 +2362,34 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) * Fallback to old SID on NNP or nosuid if not an allowed * transition. */ - rc = check_nnp_nosuid(bprm, old_tsec, new_tsec); + rc = check_nnp_nosuid(bprm, old_crsec, new_crsec); if (rc) - new_tsec->sid = old_tsec->sid; + new_crsec->sid = old_crsec->sid; } ad.type = LSM_AUDIT_DATA_FILE; ad.u.file = bprm->file; - if (new_tsec->sid == old_tsec->sid) { - rc = avc_has_perm(old_tsec->sid, isec->sid, + if (new_crsec->sid == old_crsec->sid) { + rc = avc_has_perm(old_crsec->sid, isec->sid, SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad); if (rc) return rc; } else { /* Check permissions for the transition. */ - rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__TRANSITION, &ad); if (rc) return rc; - rc = avc_has_perm(new_tsec->sid, isec->sid, + rc = avc_has_perm(new_crsec->sid, isec->sid, SECCLASS_FILE, FILE__ENTRYPOINT, &ad); if (rc) return rc; /* Check for shared state */ if (bprm->unsafe & LSM_UNSAFE_SHARE) { - rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__SHARE, NULL); if (rc) @@ -2401,7 +2401,7 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) if (bprm->unsafe & LSM_UNSAFE_PTRACE) { u32 ptsid = ptrace_parent_sid(); if (ptsid != 0) { - rc = avc_has_perm(ptsid, new_tsec->sid, + rc = avc_has_perm(ptsid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__PTRACE, NULL); if (rc) @@ -2415,7 +2415,7 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm) /* Enable secure mode for SIDs transitions unless the noatsecure permission is granted between the two SIDs, i.e. ahp returns 0. */ - rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + rc = avc_has_perm(old_crsec->sid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__NOATSECURE, NULL); bprm->secureexec |= !!rc; @@ -2483,12 +2483,12 @@ static inline void flush_unauthorized_files(const struct cred *cred, */ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm) { - struct task_security_struct *new_tsec; + struct cred_security_struct *new_crsec; struct rlimit *rlim, *initrlim; int rc, i; - new_tsec = selinux_cred(bprm->cred); - if (new_tsec->sid == new_tsec->osid) + new_crsec = selinux_cred(bprm->cred); + if (new_crsec->sid == new_crsec->osid) return; /* Close files for which the new task SID is not authorized. */ @@ -2507,7 +2507,7 @@ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm) * higher than the default soft limit for cases where the default is * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK. */ - rc = avc_has_perm(new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS, + rc = avc_has_perm(new_crsec->osid, new_crsec->sid, SECCLASS_PROCESS, PROCESS__RLIMITINH, NULL); if (rc) { /* protect against do_prlimit() */ @@ -2529,12 +2529,12 @@ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm) */ static void selinux_bprm_committed_creds(const struct linux_binprm *bprm) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); u32 osid, sid; int rc; - osid = tsec->osid; - sid = tsec->sid; + osid = crsec->osid; + sid = crsec->sid; if (sid == osid) return; @@ -2911,7 +2911,7 @@ static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, { u32 newsid; int rc; - struct task_security_struct *tsec; + struct cred_security_struct *crsec; rc = selinux_determine_inode_label(selinux_cred(old), d_inode(dentry->d_parent), name, @@ -2920,8 +2920,8 @@ static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, if (rc) return rc; - tsec = selinux_cred(new); - tsec->create_sid = newsid; + crsec = selinux_cred(new); + crsec->create_sid = newsid; return 0; } @@ -2929,7 +2929,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); struct superblock_security_struct *sbsec; struct xattr *xattr = lsm_get_xattr_slot(xattrs, xattr_count); u32 newsid, clen; @@ -2939,9 +2939,9 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir, sbsec = selinux_superblock(dir->i_sb); - newsid = tsec->create_sid; + newsid = crsec->create_sid; newsclass = inode_mode_to_security_class(inode->i_mode); - rc = selinux_determine_inode_label(tsec, dir, qstr, newsclass, &newsid); + rc = selinux_determine_inode_label(crsec, dir, qstr, newsclass, &newsid); if (rc) return rc; @@ -3113,7 +3113,7 @@ static noinline int audit_inode_permission(struct inode *inode, static inline void task_avdcache_reset(struct task_security_struct *tsec) { memset(&tsec->avdcache.dir, 0, sizeof(tsec->avdcache.dir)); - tsec->avdcache.sid = tsec->sid; + tsec->avdcache.sid = current_sid(); tsec->avdcache.seqno = avc_policy_seqno(); tsec->avdcache.dir_spot = TSEC_AVDC_DIR_SIZE - 1; } @@ -3137,7 +3137,7 @@ static inline int task_avdcache_search(struct task_security_struct *tsec, if (isec->sclass != SECCLASS_DIR) return -ENOENT; - if (unlikely(tsec->sid != tsec->avdcache.sid || + if (unlikely(current_sid() != tsec->avdcache.sid || tsec->avdcache.seqno != avc_policy_seqno())) { task_avdcache_reset(tsec); return -ENOENT; @@ -3201,6 +3201,7 @@ static int selinux_inode_permission(struct inode *inode, int requested) { int mask; u32 perms; + u32 sid = current_sid(); struct task_security_struct *tsec; struct inode_security_struct *isec; struct avdc_entry *avdc; @@ -3213,8 +3214,8 @@ static int selinux_inode_permission(struct inode *inode, int requested) if (!mask) return 0; - tsec = selinux_cred(current_cred()); - if (task_avdcache_permnoaudit(tsec)) + tsec = selinux_task(current); + if (task_avdcache_permnoaudit(tsec, sid)) return 0; isec = inode_security_rcu(inode, requested & MAY_NOT_BLOCK); @@ -3234,7 +3235,7 @@ static int selinux_inode_permission(struct inode *inode, int requested) struct av_decision avd; /* Cache miss. */ - rc = avc_has_perm_noaudit(tsec->sid, isec->sid, isec->sclass, + rc = avc_has_perm_noaudit(sid, isec->sid, isec->sclass, perms, 0, &avd); audited = avc_audit_required(perms, &avd, rc, (requested & MAY_ACCESS) ? FILE__AUDIT_ACCESS : 0, @@ -3285,9 +3286,9 @@ static int selinux_inode_getattr(const struct path *path) { struct task_security_struct *tsec; - tsec = selinux_cred(current_cred()); + tsec = selinux_task(current); - if (task_avdcache_permnoaudit(tsec)) + if (task_avdcache_permnoaudit(tsec, current_sid())) return 0; return path_has_perm(current_cred(), path, FILE__GETATTR); @@ -3659,7 +3660,7 @@ static void selinux_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop) static int selinux_inode_copy_up(struct dentry *src, struct cred **new) { struct lsm_prop prop; - struct task_security_struct *tsec; + struct cred_security_struct *crsec; struct cred *new_creds = *new; if (new_creds == NULL) { @@ -3668,10 +3669,10 @@ static int selinux_inode_copy_up(struct dentry *src, struct cred **new) return -ENOMEM; } - tsec = selinux_cred(new_creds); + crsec = selinux_cred(new_creds); /* Get label from overlay inode and set it in create_sid */ selinux_inode_getlsmprop(d_inode(src), &prop); - tsec->create_sid = prop.selinux.secid; + crsec->create_sid = prop.selinux.secid; *new = new_creds; return 0; } @@ -3697,7 +3698,7 @@ static int selinux_inode_copy_up_xattr(struct dentry *dentry, const char *name) static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); u32 parent_sid, newsid, clen; int rc; char *context; @@ -3725,8 +3726,8 @@ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, if (rc) return rc; - if (tsec->create_sid) { - newsid = tsec->create_sid; + if (crsec->create_sid) { + newsid = crsec->create_sid; } else { u16 secclass = inode_mode_to_security_class(kn->mode); const char *kn_name; @@ -3737,7 +3738,7 @@ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, q.name = kn_name; q.hash_len = hashlen_string(kn_dir, kn_name); - rc = security_transition_sid(tsec->sid, + rc = security_transition_sid(crsec->sid, parent_sid, secclass, &q, &newsid); if (rc) @@ -4151,7 +4152,10 @@ static int selinux_task_alloc(struct task_struct *task, u64 clone_flags) { u32 sid = current_sid(); + struct task_security_struct *old_tsec = selinux_task(current); + struct task_security_struct *new_tsec = selinux_task(task); + *new_tsec = *old_tsec; return avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL); } @@ -4161,10 +4165,10 @@ static int selinux_task_alloc(struct task_struct *task, static int selinux_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { - const struct task_security_struct *old_tsec = selinux_cred(old); - struct task_security_struct *tsec = selinux_cred(new); + const struct cred_security_struct *old_crsec = selinux_cred(old); + struct cred_security_struct *crsec = selinux_cred(new); - *tsec = *old_tsec; + *crsec = *old_crsec; return 0; } @@ -4173,10 +4177,10 @@ static int selinux_cred_prepare(struct cred *new, const struct cred *old, */ static void selinux_cred_transfer(struct cred *new, const struct cred *old) { - const struct task_security_struct *old_tsec = selinux_cred(old); - struct task_security_struct *tsec = selinux_cred(new); + const struct cred_security_struct *old_crsec = selinux_cred(old); + struct cred_security_struct *crsec = selinux_cred(new); - *tsec = *old_tsec; + *crsec = *old_crsec; } static void selinux_cred_getsecid(const struct cred *c, u32 *secid) @@ -4195,7 +4199,7 @@ static void selinux_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop) */ static int selinux_kernel_act_as(struct cred *new, u32 secid) { - struct task_security_struct *tsec = selinux_cred(new); + struct cred_security_struct *crsec = selinux_cred(new); u32 sid = current_sid(); int ret; @@ -4204,10 +4208,10 @@ static int selinux_kernel_act_as(struct cred *new, u32 secid) KERNEL_SERVICE__USE_AS_OVERRIDE, NULL); if (ret == 0) { - tsec->sid = secid; - tsec->create_sid = 0; - tsec->keycreate_sid = 0; - tsec->sockcreate_sid = 0; + crsec->sid = secid; + crsec->create_sid = 0; + crsec->keycreate_sid = 0; + crsec->sockcreate_sid = 0; } return ret; } @@ -4219,7 +4223,7 @@ static int selinux_kernel_act_as(struct cred *new, u32 secid) static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) { struct inode_security_struct *isec = inode_security(inode); - struct task_security_struct *tsec = selinux_cred(new); + struct cred_security_struct *crsec = selinux_cred(new); u32 sid = current_sid(); int ret; @@ -4229,7 +4233,7 @@ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode) NULL); if (ret == 0) - tsec->create_sid = isec->sid; + crsec->create_sid = isec->sid; return ret; } @@ -4744,15 +4748,15 @@ static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid) /* socket security operations */ -static int socket_sockcreate_sid(const struct task_security_struct *tsec, +static int socket_sockcreate_sid(const struct cred_security_struct *crsec, u16 secclass, u32 *socksid) { - if (tsec->sockcreate_sid > SECSID_NULL) { - *socksid = tsec->sockcreate_sid; + if (crsec->sockcreate_sid > SECSID_NULL) { + *socksid = crsec->sockcreate_sid; return 0; } - return security_transition_sid(tsec->sid, tsec->sid, + return security_transition_sid(crsec->sid, crsec->sid, secclass, NULL, socksid); } @@ -4797,7 +4801,7 @@ static int sock_has_perm(struct sock *sk, u32 perms) static int selinux_socket_create(int family, int type, int protocol, int kern) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); u32 newsid; u16 secclass; int rc; @@ -4806,17 +4810,17 @@ static int selinux_socket_create(int family, int type, return 0; secclass = socket_type_to_security_class(family, type, protocol); - rc = socket_sockcreate_sid(tsec, secclass, &newsid); + rc = socket_sockcreate_sid(crsec, secclass, &newsid); if (rc) return rc; - return avc_has_perm(tsec->sid, newsid, secclass, SOCKET__CREATE, NULL); + return avc_has_perm(crsec->sid, newsid, secclass, SOCKET__CREATE, NULL); } static int selinux_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock)); struct sk_security_struct *sksec; u16 sclass = socket_type_to_security_class(family, type, protocol); @@ -4824,7 +4828,7 @@ static int selinux_socket_post_create(struct socket *sock, int family, int err = 0; if (!kern) { - err = socket_sockcreate_sid(tsec, sclass, &sid); + err = socket_sockcreate_sid(crsec, sclass, &sid); if (err) return err; } @@ -6526,37 +6530,37 @@ static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode) static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p, char **value) { - const struct task_security_struct *tsec; + const struct cred_security_struct *crsec; int error; u32 sid; u32 len; rcu_read_lock(); - tsec = selinux_cred(__task_cred(p)); + crsec = selinux_cred(__task_cred(p)); if (p != current) { - error = avc_has_perm(current_sid(), tsec->sid, + error = avc_has_perm(current_sid(), crsec->sid, SECCLASS_PROCESS, PROCESS__GETATTR, NULL); if (error) goto err_unlock; } switch (attr) { case LSM_ATTR_CURRENT: - sid = tsec->sid; + sid = crsec->sid; break; case LSM_ATTR_PREV: - sid = tsec->osid; + sid = crsec->osid; break; case LSM_ATTR_EXEC: - sid = tsec->exec_sid; + sid = crsec->exec_sid; break; case LSM_ATTR_FSCREATE: - sid = tsec->create_sid; + sid = crsec->create_sid; break; case LSM_ATTR_KEYCREATE: - sid = tsec->keycreate_sid; + sid = crsec->keycreate_sid; break; case LSM_ATTR_SOCKCREATE: - sid = tsec->sockcreate_sid; + sid = crsec->sockcreate_sid; break; default: error = -EOPNOTSUPP; @@ -6581,7 +6585,7 @@ err_unlock: static int selinux_lsm_setattr(u64 attr, void *value, size_t size) { - struct task_security_struct *tsec; + struct cred_security_struct *crsec; struct cred *new; u32 mysid = current_sid(), sid = 0, ptsid; int error; @@ -6667,11 +6671,11 @@ static int selinux_lsm_setattr(u64 attr, void *value, size_t size) operation. See selinux_bprm_creds_for_exec for the execve checks and may_create for the file creation checks. The operation will then fail if the context is not permitted. */ - tsec = selinux_cred(new); + crsec = selinux_cred(new); if (attr == LSM_ATTR_EXEC) { - tsec->exec_sid = sid; + crsec->exec_sid = sid; } else if (attr == LSM_ATTR_FSCREATE) { - tsec->create_sid = sid; + crsec->create_sid = sid; } else if (attr == LSM_ATTR_KEYCREATE) { if (sid) { error = avc_has_perm(mysid, sid, @@ -6679,22 +6683,22 @@ static int selinux_lsm_setattr(u64 attr, void *value, size_t size) if (error) goto abort_change; } - tsec->keycreate_sid = sid; + crsec->keycreate_sid = sid; } else if (attr == LSM_ATTR_SOCKCREATE) { - tsec->sockcreate_sid = sid; + crsec->sockcreate_sid = sid; } else if (attr == LSM_ATTR_CURRENT) { error = -EINVAL; if (sid == 0) goto abort_change; if (!current_is_single_threaded()) { - error = security_bounded_transition(tsec->sid, sid); + error = security_bounded_transition(crsec->sid, sid); if (error) goto abort_change; } /* Check permissions for the transition. */ - error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, + error = avc_has_perm(crsec->sid, sid, SECCLASS_PROCESS, PROCESS__DYNTRANSITION, NULL); if (error) goto abort_change; @@ -6709,7 +6713,7 @@ static int selinux_lsm_setattr(u64 attr, void *value, size_t size) goto abort_change; } - tsec->sid = sid; + crsec->sid = sid; } else { error = -EINVAL; goto abort_change; @@ -6876,14 +6880,14 @@ static int selinux_inode_getsecctx(struct inode *inode, struct lsm_context *cp) static int selinux_key_alloc(struct key *k, const struct cred *cred, unsigned long flags) { - const struct task_security_struct *tsec; + const struct cred_security_struct *crsec; struct key_security_struct *ksec = selinux_key(k); - tsec = selinux_cred(cred); - if (tsec->keycreate_sid) - ksec->sid = tsec->keycreate_sid; + crsec = selinux_cred(cred); + if (crsec->keycreate_sid) + ksec->sid = crsec->keycreate_sid; else - ksec->sid = tsec->sid; + ksec->sid = crsec->sid; return 0; } @@ -7137,7 +7141,8 @@ static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *att #endif struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { - .lbs_cred = sizeof(struct task_security_struct), + .lbs_cred = sizeof(struct cred_security_struct), + .lbs_task = sizeof(struct task_security_struct), .lbs_file = sizeof(struct file_security_struct), .lbs_inode = sizeof(struct inode_security_struct), .lbs_ipc = sizeof(struct ipc_security_struct), diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 2d5139c6d45b..8fc3de5234ac 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -37,13 +37,16 @@ struct avdc_entry { bool permissive; /* AVC permissive flag */ }; -struct task_security_struct { +struct cred_security_struct { u32 osid; /* SID prior to last execve */ u32 sid; /* current SID */ u32 exec_sid; /* exec SID */ u32 create_sid; /* fscreate SID */ u32 keycreate_sid; /* keycreate SID */ u32 sockcreate_sid; /* fscreate SID */ +} __randomize_layout; + +struct task_security_struct { #define TSEC_AVDC_DIR_SIZE (1 << 2) struct { u32 sid; /* current SID for cached entries */ @@ -54,10 +57,11 @@ struct task_security_struct { } avdcache; } __randomize_layout; -static inline bool task_avdcache_permnoaudit(struct task_security_struct *tsec) +static inline bool task_avdcache_permnoaudit(struct task_security_struct *tsec, + u32 sid) { return (tsec->avdcache.permissive_neveraudit && - tsec->sid == tsec->avdcache.sid && + sid == tsec->avdcache.sid && tsec->avdcache.seqno == avc_policy_seqno()); } @@ -172,11 +176,17 @@ struct perf_event_security_struct { }; extern struct lsm_blob_sizes selinux_blob_sizes; -static inline struct task_security_struct *selinux_cred(const struct cred *cred) +static inline struct cred_security_struct *selinux_cred(const struct cred *cred) { return cred->security + selinux_blob_sizes.lbs_cred; } +static inline struct task_security_struct * +selinux_task(const struct task_struct *task) +{ + return task->security + selinux_blob_sizes.lbs_task; +} + static inline struct file_security_struct *selinux_file(const struct file *file) { return file->f_security + selinux_blob_sizes.lbs_file; @@ -207,9 +217,9 @@ selinux_ipc(const struct kern_ipc_perm *ipc) */ static inline u32 current_sid(void) { - const struct task_security_struct *tsec = selinux_cred(current_cred()); + const struct cred_security_struct *crsec = selinux_cred(current_cred()); - return tsec->sid; + return crsec->sid; } static inline struct superblock_security_struct * diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 232e087bce3e..404e08bf60ba 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -506,6 +506,7 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi, { int ret = 0; struct dentry *tmp_parent, *tmp_bool_dir, *tmp_class_dir; + struct renamedata rd = {}; unsigned int bool_num = 0; char **bool_names = NULL; int *bool_values = NULL; @@ -539,9 +540,14 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi, if (ret) goto out; - lock_rename(tmp_parent, fsi->sb->s_root); + rd.old_parent = tmp_parent; + rd.new_parent = fsi->sb->s_root; /* booleans */ + ret = start_renaming_two_dentries(&rd, tmp_bool_dir, fsi->bool_dir); + if (ret) + goto out; + d_exchange(tmp_bool_dir, fsi->bool_dir); swap(fsi->bool_num, bool_num); @@ -549,12 +555,17 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi, swap(fsi->bool_pending_values, bool_values); fsi->bool_dir = tmp_bool_dir; + end_renaming(&rd); /* classes */ + ret = start_renaming_two_dentries(&rd, tmp_class_dir, fsi->class_dir); + if (ret) + goto out; + d_exchange(tmp_class_dir, fsi->class_dir); fsi->class_dir = tmp_class_dir; - unlock_rename(tmp_parent, fsi->sb->s_root); + end_renaming(&rd); out: sel_remove_old_bool_data(bool_num, bool_names, bool_values); diff --git a/sound/hda/codecs/cirrus/cs420x.c b/sound/hda/codecs/cirrus/cs420x.c index 823220d5cada..13f5f1711fa4 100644 --- a/sound/hda/codecs/cirrus/cs420x.c +++ b/sound/hda/codecs/cirrus/cs420x.c @@ -585,6 +585,7 @@ static const struct hda_quirk cs4208_mac_fixup_tbl[] = { SND_PCI_QUIRK(0x106b, 0x6c00, "MacMini 7,1", CS4208_MACMINI), SND_PCI_QUIRK(0x106b, 0x7100, "MacBookAir 6,1", CS4208_MBA6), SND_PCI_QUIRK(0x106b, 0x7200, "MacBookAir 6,2", CS4208_MBA6), + SND_PCI_QUIRK(0x106b, 0x7800, "MacPro 6,1", CS4208_MACMINI), SND_PCI_QUIRK(0x106b, 0x7b00, "MacBookPro 12,1", CS4208_MBP11), {} /* terminator */ }; diff --git a/sound/hda/codecs/hdmi/nvhdmi-mcp.c b/sound/hda/codecs/hdmi/nvhdmi-mcp.c index 8fd8d76fa72f..1c5fdfe872f2 100644 --- a/sound/hda/codecs/hdmi/nvhdmi-mcp.c +++ b/sound/hda/codecs/hdmi/nvhdmi-mcp.c @@ -350,8 +350,8 @@ static int nvhdmi_mcp_probe(struct hda_codec *codec, static const struct hda_codec_ops nvhdmi_mcp_codec_ops = { .probe = nvhdmi_mcp_probe, .remove = snd_hda_hdmi_simple_remove, - .build_controls = nvhdmi_mcp_build_pcms, - .build_pcms = nvhdmi_mcp_build_controls, + .build_pcms = nvhdmi_mcp_build_pcms, + .build_controls = nvhdmi_mcp_build_controls, .init = nvhdmi_mcp_init, .unsol_event = snd_hda_hdmi_simple_unsol_event, }; diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 4aec5067c59d..b45fcc9a3785 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -6525,6 +6525,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8a4f, "HP Victus 15-fa0xxx (MB 8A4F)", ALC245_FIXUP_HP_MUTE_LED_COEFBIT), SND_PCI_QUIRK(0x103c, 0x8a6e, "HP EDNA 360", ALC287_FIXUP_CS35L41_I2C_4), SND_PCI_QUIRK(0x103c, 0x8a74, "HP ProBook 440 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8a75, "HP ProBook 450 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8a78, "HP Dev One", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x103c, 0x8aa0, "HP ProBook 440 G9 (MB 8A9E)", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8aa3, "HP ProBook 450 G9 (MB 8AA1)", ALC236_FIXUP_HP_GPIO_LED), @@ -6572,6 +6573,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8bc8, "HP Victus 15-fa1xxx", ALC245_FIXUP_HP_MUTE_LED_COEFBIT), SND_PCI_QUIRK(0x103c, 0x8bcd, "HP Omen 16-xd0xxx", ALC245_FIXUP_HP_MUTE_LED_V1_COEFBIT), SND_PCI_QUIRK(0x103c, 0x8bd4, "HP Victus 16-s0xxx (MB 8BD4)", ALC245_FIXUP_HP_MUTE_LED_COEFBIT), + SND_PCI_QUIRK(0x103c, 0x8bd6, "HP Pavilion Aero Laptop 13z-be200", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8bdd, "HP Envy 17", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8bde, "HP Envy 17", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8bdf, "HP Envy 15", ALC287_FIXUP_CS35L41_I2C_2), @@ -6694,6 +6696,15 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8e60, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8e61, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8e62, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x8ed5, "HP Merino13X", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8ed6, "HP Merino13", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8ed7, "HP Merino14", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8ed8, "HP Merino16", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8ed9, "HP Merino14W", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8eda, "HP Merino16W", ALC245_FIXUP_TAS2781_SPI_2), + SND_PCI_QUIRK(0x103c, 0x8f40, "HP Lampas14", ALC287_FIXUP_TXNW2781_I2C), + SND_PCI_QUIRK(0x103c, 0x8f41, "HP Lampas16", ALC287_FIXUP_TXNW2781_I2C), + SND_PCI_QUIRK(0x103c, 0x8f42, "HP LampasW14", ALC287_FIXUP_TXNW2781_I2C), SND_PCI_QUIRK(0x1043, 0x1032, "ASUS VivoBook X513EA", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1034, "ASUS GU605C", ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), diff --git a/sound/pci/au88x0/au88x0.c b/sound/pci/au88x0/au88x0.c index de56e83d8e10..bb02945793f0 100644 --- a/sound/pci/au88x0/au88x0.c +++ b/sound/pci/au88x0/au88x0.c @@ -280,11 +280,11 @@ __snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) // (5) err = pci_read_config_word(pci, PCI_DEVICE_ID, &chip->device); - if (err < 0) - return err; + if (err) + return pcibios_err_to_errno(err); err = pci_read_config_word(pci, PCI_VENDOR_ID, &chip->vendor); - if (err < 0) - return err; + if (err) + return pcibios_err_to_errno(err); chip->rev = pci->revision; #ifdef CHIP_AU8830 if ((chip->rev) != 0xfe && (chip->rev) != 0xfa) { diff --git a/sound/soc/codecs/cs4271.c b/sound/soc/codecs/cs4271.c index 6a3cca3d26c7..ead447a5da7f 100644 --- a/sound/soc/codecs/cs4271.c +++ b/sound/soc/codecs/cs4271.c @@ -581,17 +581,17 @@ static int cs4271_component_probe(struct snd_soc_component *component) ret = regcache_sync(cs4271->regmap); if (ret < 0) - return ret; + goto err_disable_regulator; ret = regmap_update_bits(cs4271->regmap, CS4271_MODE2, CS4271_MODE2_PDN | CS4271_MODE2_CPEN, CS4271_MODE2_PDN | CS4271_MODE2_CPEN); if (ret < 0) - return ret; + goto err_disable_regulator; ret = regmap_update_bits(cs4271->regmap, CS4271_MODE2, CS4271_MODE2_PDN, 0); if (ret < 0) - return ret; + goto err_disable_regulator; /* Power-up sequence requires 85 uS */ udelay(85); @@ -601,6 +601,10 @@ static int cs4271_component_probe(struct snd_soc_component *component) CS4271_MODE2_MUTECAEQUB); return 0; + +err_disable_regulator: + regulator_bulk_disable(ARRAY_SIZE(cs4271->supplies), cs4271->supplies); + return ret; } static void cs4271_component_remove(struct snd_soc_component *component) diff --git a/sound/soc/codecs/da7213.c b/sound/soc/codecs/da7213.c index ae89260ca215..3420011da444 100644 --- a/sound/soc/codecs/da7213.c +++ b/sound/soc/codecs/da7213.c @@ -2124,11 +2124,50 @@ static int da7213_probe(struct snd_soc_component *component) return 0; } +static int da7213_runtime_suspend(struct device *dev) +{ + struct da7213_priv *da7213 = dev_get_drvdata(dev); + + regcache_cache_only(da7213->regmap, true); + regcache_mark_dirty(da7213->regmap); + regulator_bulk_disable(DA7213_NUM_SUPPLIES, da7213->supplies); + + return 0; +} + +static int da7213_runtime_resume(struct device *dev) +{ + struct da7213_priv *da7213 = dev_get_drvdata(dev); + int ret; + + ret = regulator_bulk_enable(DA7213_NUM_SUPPLIES, da7213->supplies); + if (ret < 0) + return ret; + regcache_cache_only(da7213->regmap, false); + return regcache_sync(da7213->regmap); +} + +static int da7213_suspend(struct snd_soc_component *component) +{ + struct da7213_priv *da7213 = snd_soc_component_get_drvdata(component); + + return da7213_runtime_suspend(da7213->dev); +} + +static int da7213_resume(struct snd_soc_component *component) +{ + struct da7213_priv *da7213 = snd_soc_component_get_drvdata(component); + + return da7213_runtime_resume(da7213->dev); +} + static const struct snd_soc_component_driver soc_component_dev_da7213 = { .probe = da7213_probe, .set_bias_level = da7213_set_bias_level, .controls = da7213_snd_controls, .num_controls = ARRAY_SIZE(da7213_snd_controls), + .suspend = da7213_suspend, + .resume = da7213_resume, .dapm_widgets = da7213_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(da7213_dapm_widgets), .dapm_routes = da7213_audio_map, @@ -2175,6 +2214,8 @@ static int da7213_i2c_probe(struct i2c_client *i2c) if (!da7213->fin_min_rate) return -EINVAL; + da7213->dev = &i2c->dev; + i2c_set_clientdata(i2c, da7213); /* Get required supplies */ @@ -2224,31 +2265,9 @@ static void da7213_i2c_remove(struct i2c_client *i2c) pm_runtime_disable(&i2c->dev); } -static int da7213_runtime_suspend(struct device *dev) -{ - struct da7213_priv *da7213 = dev_get_drvdata(dev); - - regcache_cache_only(da7213->regmap, true); - regcache_mark_dirty(da7213->regmap); - regulator_bulk_disable(DA7213_NUM_SUPPLIES, da7213->supplies); - - return 0; -} - -static int da7213_runtime_resume(struct device *dev) -{ - struct da7213_priv *da7213 = dev_get_drvdata(dev); - int ret; - - ret = regulator_bulk_enable(DA7213_NUM_SUPPLIES, da7213->supplies); - if (ret < 0) - return ret; - regcache_cache_only(da7213->regmap, false); - return regcache_sync(da7213->regmap); -} - -static DEFINE_RUNTIME_DEV_PM_OPS(da7213_pm, da7213_runtime_suspend, - da7213_runtime_resume, NULL); +static const struct dev_pm_ops da7213_pm = { + RUNTIME_PM_OPS(da7213_runtime_suspend, da7213_runtime_resume, NULL) +}; static const struct i2c_device_id da7213_i2c_id[] = { { "da7213" }, diff --git a/sound/soc/codecs/da7213.h b/sound/soc/codecs/da7213.h index b9ab791d6b88..29cbf0eb6124 100644 --- a/sound/soc/codecs/da7213.h +++ b/sound/soc/codecs/da7213.h @@ -595,6 +595,7 @@ enum da7213_supplies { /* Codec private data */ struct da7213_priv { struct regmap *regmap; + struct device *dev; struct mutex ctrl_lock; struct regulator_bulk_data supplies[DA7213_NUM_SUPPLIES]; struct clk *mclk; diff --git a/sound/soc/codecs/lpass-va-macro.c b/sound/soc/codecs/lpass-va-macro.c index 2e1b77973a3e..92c177b82a02 100644 --- a/sound/soc/codecs/lpass-va-macro.c +++ b/sound/soc/codecs/lpass-va-macro.c @@ -1638,7 +1638,7 @@ static int va_macro_probe(struct platform_device *pdev) if (ret) goto err_clkout; - va->fsgen = clk_hw_get_clk(&va->hw, "fsgen"); + va->fsgen = devm_clk_hw_get_clk(dev, &va->hw, "fsgen"); if (IS_ERR(va->fsgen)) { ret = PTR_ERR(va->fsgen); goto err_clkout; diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c index ba880b5de7e8..8f37aa00e62e 100644 --- a/sound/soc/codecs/tas2781-i2c.c +++ b/sound/soc/codecs/tas2781-i2c.c @@ -1957,7 +1957,8 @@ static void tasdevice_parse_dt(struct tasdevice_priv *tas_priv) { struct i2c_client *client = (struct i2c_client *)tas_priv->client; unsigned int dev_addrs[TASDEVICE_MAX_CHANNELS]; - int i, ndev = 0; + int ndev = 0; + int i, rc; if (tas_priv->isacpi) { ndev = device_property_read_u32_array(&client->dev, @@ -1968,8 +1969,12 @@ static void tasdevice_parse_dt(struct tasdevice_priv *tas_priv) } else { ndev = (ndev < ARRAY_SIZE(dev_addrs)) ? ndev : ARRAY_SIZE(dev_addrs); - ndev = device_property_read_u32_array(&client->dev, + rc = device_property_read_u32_array(&client->dev, "ti,audio-slots", dev_addrs, ndev); + if (rc != 0) { + ndev = 1; + dev_addrs[0] = client->addr; + } } tas_priv->irq = diff --git a/sound/soc/codecs/tas2783-sdw.c b/sound/soc/codecs/tas2783-sdw.c index 1fb4227b711e..e273b80d033e 100644 --- a/sound/soc/codecs/tas2783-sdw.c +++ b/sound/soc/codecs/tas2783-sdw.c @@ -762,10 +762,17 @@ static void tas2783_fw_ready(const struct firmware *fmw, void *context) goto out; } - mutex_lock(&tas_dev->pde_lock); img_sz = fmw->size; buf = fmw->data; offset += FW_DL_OFFSET; + if (offset >= (img_sz - FW_FL_HDR)) { + dev_err(tas_dev->dev, + "firmware is too small"); + ret = -EINVAL; + goto out; + } + + mutex_lock(&tas_dev->pde_lock); while (offset < (img_sz - FW_FL_HDR)) { memset(&hdr, 0, sizeof(hdr)); offset += read_header(&buf[offset], &hdr); @@ -776,6 +783,14 @@ static void tas2783_fw_ready(const struct firmware *fmw, void *context) /* size also includes the header */ file_blk_size = hdr.length - FW_FL_HDR; + /* make sure that enough data is there */ + if (offset + file_blk_size > img_sz) { + ret = -EINVAL; + dev_err(tas_dev->dev, + "corrupt firmware file"); + break; + } + switch (hdr.file_id) { case 0: ret = sdw_nwrite_no_pm(tas_dev->sdw_peripheral, @@ -808,7 +823,8 @@ static void tas2783_fw_ready(const struct firmware *fmw, void *context) break; } mutex_unlock(&tas_dev->pde_lock); - tas2783_update_calibdata(tas_dev); + if (!ret) + tas2783_update_calibdata(tas_dev); out: if (!ret) diff --git a/sound/soc/renesas/rcar/ssiu.c b/sound/soc/renesas/rcar/ssiu.c index faf351126d57..244fb833292a 100644 --- a/sound/soc/renesas/rcar/ssiu.c +++ b/sound/soc/renesas/rcar/ssiu.c @@ -509,7 +509,7 @@ void rsnd_parse_connect_ssiu(struct rsnd_dai *rdai, int rsnd_ssiu_probe(struct rsnd_priv *priv) { struct device *dev = rsnd_priv_to_dev(priv); - struct device_node *node; + struct device_node *node __free(device_node) = rsnd_ssiu_of_node(priv); struct rsnd_ssiu *ssiu; struct rsnd_mod_ops *ops; const int *list = NULL; @@ -522,7 +522,6 @@ int rsnd_ssiu_probe(struct rsnd_priv *priv) * see * rsnd_ssiu_bufsif_to_id() */ - node = rsnd_ssiu_of_node(priv); if (node) nr = rsnd_node_count(priv, node, SSIU_NAME); else diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c index 13f68f7b6dd6..0ccb6775f4de 100644 --- a/sound/soc/sdca/sdca_functions.c +++ b/sound/soc/sdca/sdca_functions.c @@ -894,7 +894,8 @@ static int find_sdca_entity_control(struct device *dev, struct sdca_entity *enti return ret; } - control->values = devm_kzalloc(dev, hweight64(control->cn_list), GFP_KERNEL); + control->values = devm_kcalloc(dev, hweight64(control->cn_list), + sizeof(int), GFP_KERNEL); if (!control->values) return -ENOMEM; diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index f7c8c16308de..3848c7df1916 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -1277,7 +1277,7 @@ static int is_sdca_endpoint_present(struct device *dev, struct sdw_slave *slave; struct device *sdw_dev; const char *sdw_codec_name; - int i; + int ret, i; dlc = kzalloc(sizeof(*dlc), GFP_KERNEL); if (!dlc) @@ -1307,13 +1307,16 @@ static int is_sdca_endpoint_present(struct device *dev, } slave = dev_to_sdw_dev(sdw_dev); - if (!slave) - return -EINVAL; + if (!slave) { + ret = -EINVAL; + goto put_device; + } /* Make sure BIOS provides SDCA properties */ if (!slave->sdca_data.interface_revision) { dev_warn(&slave->dev, "SDCA properties not found in the BIOS\n"); - return 1; + ret = 1; + goto put_device; } for (i = 0; i < slave->sdca_data.num_functions; i++) { @@ -1322,7 +1325,8 @@ static int is_sdca_endpoint_present(struct device *dev, if (dai_type == dai_info->dai_type) { dev_dbg(&slave->dev, "DAI type %d sdca function %s found\n", dai_type, slave->sdca_data.function[i].name); - return 1; + ret = 1; + goto put_device; } } @@ -1330,7 +1334,11 @@ static int is_sdca_endpoint_present(struct device *dev, "SDCA device function for DAI type %d not supported, skip endpoint\n", dai_info->dai_type); - return 0; + ret = 0; + +put_device: + put_device(sdw_dev); + return ret; } int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card, diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index 880f5afcce60..cc15624ecaff 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -1362,6 +1362,11 @@ int snd_usb_endpoint_set_params(struct snd_usb_audio *chip, ep->sample_rem = ep->cur_rate % ep->pps; ep->packsize[0] = ep->cur_rate / ep->pps; ep->packsize[1] = (ep->cur_rate + (ep->pps - 1)) / ep->pps; + if (ep->packsize[1] > ep->maxpacksize) { + usb_audio_dbg(chip, "Too small maxpacksize %u for rate %u / pps %u\n", + ep->maxpacksize, ep->cur_rate, ep->pps); + return -EINVAL; + } /* calculate the frequency in 16.16 format */ ep->freqm = ep->freqn; diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 6f00e0d52382..3af71d42b9b9 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -921,7 +921,7 @@ static int parse_term_uac2_clock_source(struct mixer_build *state, { struct uac_clock_source_descriptor *d = p1; - term->type = UAC3_CLOCK_SOURCE << 16; /* virtual type */ + term->type = UAC2_CLOCK_SOURCE << 16; /* virtual type */ term->id = id; term->name = d->iClockSource; return 0; @@ -3086,6 +3086,8 @@ static int snd_usb_mixer_controls_badd(struct usb_mixer_interface *mixer, int i; assoc = usb_ifnum_to_if(dev, ctrlif)->intf_assoc; + if (!assoc) + return -EINVAL; /* Detect BADD capture/playback channels from AS EP descriptors */ for (i = 0; i < assoc->bInterfaceCount; i++) { diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 71638e6dfb20..61bd61ffb1b2 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2022,12 +2022,15 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, case USB_ID(0x16d0, 0x09d8): /* NuPrime IDA-8 */ case USB_ID(0x16d0, 0x09db): /* NuPrime Audio DAC-9 */ case USB_ID(0x16d0, 0x09dd): /* Encore mDSD */ + case USB_ID(0x16d0, 0x0ab1): /* PureAudio APA DAC */ + case USB_ID(0x16d0, 0xeca1): /* PureAudio Lotus DAC5, DAC5 SE, DAC5 Pro */ case USB_ID(0x1db5, 0x0003): /* Bryston BDA3 */ case USB_ID(0x20a0, 0x4143): /* WaveIO USB Audio 2.0 */ case USB_ID(0x22e1, 0xca01): /* HDTA Serenade DSD */ case USB_ID(0x249c, 0x9326): /* M2Tech Young MkIII */ case USB_ID(0x2616, 0x0106): /* PS Audio NuWave DAC */ case USB_ID(0x2622, 0x0041): /* Audiolab M-DAC+ */ + case USB_ID(0x2622, 0x0061): /* LEAK Stereo 230 */ case USB_ID(0x278b, 0x5100): /* Rotel RC-1590 */ case USB_ID(0x27f7, 0x3002): /* W4S DAC-2v2SE */ case USB_ID(0x29a2, 0x0086): /* Mutec MC3+ USB */ @@ -2267,6 +2270,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_FIXED_RATE), DEVICE_FLG(0x0fd9, 0x0008, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), + DEVICE_FLG(0x1038, 0x1294, /* SteelSeries Arctis Pro Wireless */ + QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x1101, 0x0003, /* Audioengine D1 */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x12d1, 0x3a07, /* Huawei Technologies Co., Ltd. */ @@ -2297,6 +2302,10 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_IGNORE_CLOCK_SOURCE), DEVICE_FLG(0x1686, 0x00dd, /* Zoom R16/24 */ QUIRK_FLAG_TX_LENGTH | QUIRK_FLAG_CTL_MSG_DELAY_1M), + DEVICE_FLG(0x16d0, 0x0ab1, /* PureAudio APA DAC */ + QUIRK_FLAG_DSD_RAW), + DEVICE_FLG(0x16d0, 0xeca1, /* PureAudio Lotus DAC5, DAC5 SE and DAC5 Pro */ + QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x17aa, 0x1046, /* Lenovo ThinkStation P620 Rear Line-in, Line-out and Microphone */ QUIRK_FLAG_DISABLE_AUTOSUSPEND), DEVICE_FLG(0x17aa, 0x104d, /* Lenovo ThinkStation P620 Internal Speaker + Front Headset */ @@ -2420,6 +2429,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x25ce, /* Mytek devices */ QUIRK_FLAG_DSD_RAW), + VENDOR_FLG(0x2622, /* IAG Limited devices */ + QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x278b, /* Rotel? */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x292b, /* Gustard/Ess based devices */ diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h index 56d7367ee344..21d8cee04638 100644 --- a/tools/arch/riscv/include/asm/csr.h +++ b/tools/arch/riscv/include/asm/csr.h @@ -167,7 +167,8 @@ #define VSIP_TO_HVIP_SHIFT (IRQ_VS_SOFT - IRQ_S_SOFT) #define VSIP_VALID_MASK ((_AC(1, UL) << IRQ_S_SOFT) | \ (_AC(1, UL) << IRQ_S_TIMER) | \ - (_AC(1, UL) << IRQ_S_EXT)) + (_AC(1, UL) << IRQ_S_EXT) | \ + (_AC(1, UL) << IRQ_PMU_OVF)) /* AIA CSR bits */ #define TOPI_IID_SHIFT 16 @@ -280,7 +281,7 @@ #define CSR_HPMCOUNTER30H 0xc9e #define CSR_HPMCOUNTER31H 0xc9f -#define CSR_SSCOUNTOVF 0xda0 +#define CSR_SCOUNTOVF 0xda0 #define CSR_SSTATUS 0x100 #define CSR_SIE 0x104 diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 06fc0479a23f..245cf6b3ec57 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -320,7 +320,7 @@ #define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */ #define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */ #define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */ -#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */ +#define X86_FEATURE_LKGS (12*32+18) /* Like MOV_GS except MSR_KERNEL_GS_BASE = GS.base */ #define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ @@ -444,6 +444,7 @@ #define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */ #define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" Secure Encrypted Virtualization - Encrypted State */ #define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" Secure Encrypted Virtualization - Secure Nested Paging */ +#define X86_FEATURE_SNP_SECURE_TSC (19*32+ 8) /* SEV-SNP Secure TSC */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* hardware-enforced cache coherency */ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ @@ -495,6 +496,9 @@ #define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */ #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ +#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ +#define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ +#define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ /* * BUG word(s) @@ -551,4 +555,5 @@ #define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */ #define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ #define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */ +#define X86_BUG_VMSCAPE X86_BUG( 1*32+10) /* "vmscape" CPU is affected by VMSCAPE attacks from guests */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index c683d609934b..8f10f2943370 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -312,7 +312,6 @@ static inline int insn_offset_immediate(struct insn *insn) /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. - * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix @@ -321,8 +320,8 @@ static inline int insn_offset_immediate(struct insn *insn) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. */ -#define for_each_insn_prefix(insn, idx, prefix) \ - for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) +#define for_each_insn_prefix(insn, prefix) \ + for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index f627196eb796..9e1720d73244 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -315,9 +315,12 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 + +#define PERF_CAP_LBR_FMT 0x3f #define PERF_CAP_PEBS_TRAP BIT_ULL(6) #define PERF_CAP_ARCH_REG BIT_ULL(7) #define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_FW_WRITES BIT_ULL(13) #define PERF_CAP_PEBS_BASELINE BIT_ULL(14) #define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17) #define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ @@ -633,6 +636,11 @@ #define MSR_AMD_PPIN 0xc00102f1 #define MSR_AMD64_CPUID_FN_7 0xc0011002 #define MSR_AMD64_CPUID_FN_1 0xc0011004 + +#define MSR_AMD64_CPUID_EXT_FEAT 0xc0011005 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT 54 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT BIT_ULL(MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) + #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_TW_CFG 0xc0011023 @@ -701,8 +709,15 @@ #define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) #define MSR_AMD64_SNP_SMT_PROT_BIT 17 #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) -#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) +#define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) +#define MSR_AMD64_SAVIC_CONTROL 0xc0010138 +#define MSR_AMD64_SAVIC_EN_BIT 0 +#define MSR_AMD64_SAVIC_EN BIT_ULL(MSR_AMD64_SAVIC_EN_BIT) +#define MSR_AMD64_SAVIC_ALLOWEDNMI_BIT 1 +#define MSR_AMD64_SAVIC_ALLOWEDNMI BIT_ULL(MSR_AMD64_SAVIC_ALLOWEDNMI_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 #define MSR_AMD64_RMP_CFG 0xc0010136 @@ -735,6 +750,7 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET 0xc0000303 /* AMD Hardware Feedback Support MSRs */ #define MSR_AMD_WORKLOAD_CLASS_CONFIG 0xc0000500 @@ -1225,6 +1241,8 @@ /* - AMD: */ #define MSR_IA32_MBA_BW_BASE 0xc0000200 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_L3_QOS_ABMC_CFG 0xc00003fd +#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff #define MSR_IA32_EVT_CFG_BASE 0xc0000400 /* AMD-V MSRs */ diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 0f15d683817d..d420c9c066d4 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -35,6 +35,11 @@ #define MC_VECTOR 18 #define XM_VECTOR 19 #define VE_VECTOR 20 +#define CP_VECTOR 21 + +#define HV_VECTOR 28 +#define VC_VECTOR 29 +#define SX_VECTOR 30 /* Select x86 specific features in <linux/kvm.h> */ #define __KVM_HAVE_PIT @@ -411,6 +416,35 @@ struct kvm_xcrs { __u64 padding[16]; }; +#define KVM_X86_REG_TYPE_MSR 2 +#define KVM_X86_REG_TYPE_KVM 3 + +#define KVM_X86_KVM_REG_SIZE(reg) \ +({ \ + reg == KVM_REG_GUEST_SSP ? KVM_REG_SIZE_U64 : 0; \ +}) + +#define KVM_X86_REG_TYPE_SIZE(type, reg) \ +({ \ + __u64 type_size = (__u64)type << 32; \ + \ + type_size |= type == KVM_X86_REG_TYPE_MSR ? KVM_REG_SIZE_U64 : \ + type == KVM_X86_REG_TYPE_KVM ? KVM_X86_KVM_REG_SIZE(reg) : \ + 0; \ + type_size; \ +}) + +#define KVM_X86_REG_ID(type, index) \ + (KVM_REG_X86 | KVM_X86_REG_TYPE_SIZE(type, index) | index) + +#define KVM_X86_REG_MSR(index) \ + KVM_X86_REG_ID(KVM_X86_REG_TYPE_MSR, index) +#define KVM_X86_REG_KVM(index) \ + KVM_X86_REG_ID(KVM_X86_REG_TYPE_KVM, index) + +/* KVM-defined registers starting from 0 */ +#define KVM_REG_GUEST_SSP 0 + #define KVM_SYNC_X86_REGS (1UL << 0) #define KVM_SYNC_X86_SREGS (1UL << 1) #define KVM_SYNC_X86_EVENTS (1UL << 2) diff --git a/tools/arch/x86/include/uapi/asm/svm.h b/tools/arch/x86/include/uapi/asm/svm.h index 9c640a521a67..650e3256ea7d 100644 --- a/tools/arch/x86/include/uapi/asm/svm.h +++ b/tools/arch/x86/include/uapi/asm/svm.h @@ -118,6 +118,10 @@ #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 #define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 +#define SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 +#define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 +#define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index f0f4a4cf84a7..1baa86dfe029 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -93,7 +93,10 @@ #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_SEAMCALL 76 #define EXIT_REASON_TDCALL 77 +#define EXIT_REASON_MSR_READ_IMM 84 +#define EXIT_REASON_MSR_WRITE_IMM 85 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -158,7 +161,9 @@ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ { EXIT_REASON_NOTIFY, "NOTIFY" }, \ - { EXIT_REASON_TDCALL, "TDCALL" } + { EXIT_REASON_TDCALL, "TDCALL" }, \ + { EXIT_REASON_MSR_READ_IMM, "MSR_READ_IMM" }, \ + { EXIT_REASON_MSR_WRITE_IMM, "MSR_WRITE_IMM" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk b/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk new file mode 100644 index 000000000000..cc4c7a3e6c2e --- /dev/null +++ b/tools/arch/x86/tools/gen-cpu-feature-names-x86.awk @@ -0,0 +1,34 @@ +#!/bin/awk -f +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2025, Oracle and/or its affiliates. +# +# Usage: awk -f gen-cpu-feature-names-x86.awk cpufeatures.h > cpu-feature-names.c +# + +BEGIN { + print "/* cpu feature name array generated from cpufeatures.h */" + print "/* Do not change this code. */" + print + print "static const char *cpu_feature_names[(NCAPINTS+NBUGINTS)*32] = {" + + value_expr = "\\([0-9*+ ]+\\)" +} + +/^#define X86_FEATURE_/ { + if (match($0, value_expr)) { + value = substr($0, RSTART + 1, RLENGTH - 2) + print "\t[" value "] = \"" $2 "\"," + } +} + +/^#define X86_BUG_/ { + if (match($0, value_expr)) { + value = substr($0, RSTART + 1, RLENGTH - 2) + print "\t[NCAPINTS*32+(" value ")] = \"" $2 "\"," + } +} + +END { + print "};" +} diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 009633294b09..35aeeaf5f711 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -182,7 +182,7 @@ bpftool prog tracelog bpftool prog tracelog { stdout | stderr } *PROG* Dump the BPF stream of the program. BPF programs can write to these streams - at runtime with the **bpf_stream_vprintk**\ () kfunc. The kernel may write + at runtime with the **bpf_stream_vprintk_impl**\ () kfunc. The kernel may write error messages to the standard error stream. This facility should be used only for debugging purposes. diff --git a/tools/build/Build b/tools/build/Build new file mode 100644 index 000000000000..1c7e598e9f59 --- /dev/null +++ b/tools/build/Build @@ -0,0 +1,2 @@ +hostprogs := fixdep +fixdep-y := fixdep.o diff --git a/tools/build/Makefile b/tools/build/Makefile index 63ef21878761..3a5a3808ab2a 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -37,5 +37,22 @@ ifneq ($(wildcard $(TMP_O)),) $(Q)$(MAKE) -C feature OUTPUT=$(TMP_O) clean >/dev/null endif -$(OUTPUT)fixdep: $(srctree)/tools/build/fixdep.c - $(QUIET_CC)$(HOSTCC) $(KBUILD_HOSTCFLAGS) $(KBUILD_HOSTLDFLAGS) -o $@ $< +FIXDEP := $(OUTPUT)fixdep +FIXDEP_IN := $(OUTPUT)fixdep-in.o + +# To track fixdep's dependencies properly, fixdep needs to run on itself. +# Build it twice the first time. +$(FIXDEP_IN): FORCE + $(Q)if [ ! -f $(FIXDEP) ]; then \ + $(MAKE) $(build)=fixdep HOSTCFLAGS="$(KBUILD_HOSTCFLAGS)"; \ + rm -f $(FIXDEP).o; \ + fi + $(Q)$(MAKE) $(build)=fixdep HOSTCFLAGS="$(KBUILD_HOSTCFLAGS)" + + +$(FIXDEP): $(FIXDEP_IN) + $(QUIET_LINK)$(HOSTCC) $(FIXDEP_IN) $(KBUILD_HOSTLDFLAGS) -o $@ + +FORCE: + +.PHONY: FORCE diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 32bbe29fe5f6..300a329bc581 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -315,5 +315,7 @@ endef ifeq ($(FEATURE_DISPLAY_DEFERRED),) $(call feature_display_entries) - $(info ) + ifeq ($(feature_display),1) + $(info ) + endif endif diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 49b0add392b1..95646290cb89 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -107,7 +107,7 @@ all: $(FILES) __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS) BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1 BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl - BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd + BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -ldl -lz -llzma -lzstd __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS) BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1 @@ -115,7 +115,7 @@ __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$( ############################### $(OUTPUT)test-all.bin: - $(BUILD_ALL) || $(BUILD_ALL) -lopcodes -liberty + $(BUILD_ALL) $(OUTPUT)test-hello.bin: $(BUILD) diff --git a/tools/include/asm-generic/bitops/__fls.h b/tools/include/asm-generic/bitops/__fls.h index e974ec932ec1..35f33780ca6c 100644 --- a/tools/include/asm-generic/bitops/__fls.h +++ b/tools/include/asm-generic/bitops/__fls.h @@ -10,7 +10,7 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned int generic___fls(unsigned long word) +static __always_inline __attribute_const__ unsigned int generic___fls(unsigned long word) { unsigned int num = BITS_PER_LONG - 1; diff --git a/tools/include/asm-generic/bitops/fls.h b/tools/include/asm-generic/bitops/fls.h index 26f3ce1dd6e4..8eed3437edb9 100644 --- a/tools/include/asm-generic/bitops/fls.h +++ b/tools/include/asm-generic/bitops/fls.h @@ -10,7 +10,7 @@ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int generic_fls(unsigned int x) +static __always_inline __attribute_const__ int generic_fls(unsigned int x) { int r = 32; diff --git a/tools/include/asm-generic/bitops/fls64.h b/tools/include/asm-generic/bitops/fls64.h index 866f2b2304ff..b5f58dd261a3 100644 --- a/tools/include/asm-generic/bitops/fls64.h +++ b/tools/include/asm-generic/bitops/fls64.h @@ -16,7 +16,7 @@ * at position 64. */ #if BITS_PER_LONG == 32 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { __u32 h = x >> 32; if (h) @@ -24,7 +24,7 @@ static __always_inline int fls64(__u64 x) return fls(x); } #elif BITS_PER_LONG == 64 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { if (x == 0) return 0; diff --git a/tools/include/linux/interval_tree_generic.h b/tools/include/linux/interval_tree_generic.h index aaa8a0767aa3..c5a2fed49eb0 100644 --- a/tools/include/linux/interval_tree_generic.h +++ b/tools/include/linux/interval_tree_generic.h @@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ * Cond2: start <= ITLAST(node) \ */ \ \ -static ITSTRUCT * \ +ITSTATIC ITSTRUCT * \ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ { \ while (true) { \ @@ -104,12 +104,8 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ if (ITSTART(node) <= last) { /* Cond1 */ \ if (start <= ITLAST(node)) /* Cond2 */ \ return node; /* node is leftmost match */ \ - if (node->ITRB.rb_right) { \ - node = rb_entry(node->ITRB.rb_right, \ - ITSTRUCT, ITRB); \ - if (start <= node->ITSUBTREE) \ - continue; \ - } \ + node = rb_entry(node->ITRB.rb_right, ITSTRUCT, ITRB); \ + continue; \ } \ return NULL; /* No match */ \ } \ diff --git a/tools/include/linux/livepatch_external.h b/tools/include/linux/livepatch_external.h new file mode 100644 index 000000000000..138af19b0f5c --- /dev/null +++ b/tools/include/linux/livepatch_external.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * External livepatch interfaces for patch creation tooling + */ + +#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_ +#define _LINUX_LIVEPATCH_EXTERNAL_H_ + +#include <linux/types.h> + +#define KLP_RELOC_SEC_PREFIX ".klp.rela." +#define KLP_SYM_PREFIX ".klp.sym." + +#define __KLP_PRE_PATCH_PREFIX __klp_pre_patch_callback_ +#define __KLP_POST_PATCH_PREFIX __klp_post_patch_callback_ +#define __KLP_PRE_UNPATCH_PREFIX __klp_pre_unpatch_callback_ +#define __KLP_POST_UNPATCH_PREFIX __klp_post_unpatch_callback_ + +#define KLP_PRE_PATCH_PREFIX __stringify(__KLP_PRE_PATCH_PREFIX) +#define KLP_POST_PATCH_PREFIX __stringify(__KLP_POST_PATCH_PREFIX) +#define KLP_PRE_UNPATCH_PREFIX __stringify(__KLP_PRE_UNPATCH_PREFIX) +#define KLP_POST_UNPATCH_PREFIX __stringify(__KLP_POST_UNPATCH_PREFIX) + +struct klp_object; + +typedef int (*klp_pre_patch_t)(struct klp_object *obj); +typedef void (*klp_post_patch_t)(struct klp_object *obj); +typedef void (*klp_pre_unpatch_t)(struct klp_object *obj); +typedef void (*klp_post_unpatch_t)(struct klp_object *obj); + +/** + * struct klp_callbacks - pre/post live-(un)patch callback structure + * @pre_patch: executed before code patching + * @post_patch: executed after code patching + * @pre_unpatch: executed before code unpatching + * @post_unpatch: executed after code unpatching + * @post_unpatch_enabled: flag indicating if post-unpatch callback + * should run + * + * All callbacks are optional. Only the pre-patch callback, if provided, + * will be unconditionally executed. If the parent klp_object fails to + * patch for any reason, including a non-zero error status returned from + * the pre-patch callback, no further callbacks will be executed. + */ +struct klp_callbacks { + klp_pre_patch_t pre_patch; + klp_post_patch_t post_patch; + klp_pre_unpatch_t pre_unpatch; + klp_post_unpatch_t post_unpatch; + bool post_unpatch_enabled; +}; + +/* + * 'struct klp_{func,object}_ext' are compact "external" representations of + * 'struct klp_{func,object}'. They are used by objtool for livepatch + * generation. The structs are then read by the livepatch module and converted + * to the real structs before calling klp_enable_patch(). + * + * TODO make these the official API for klp_enable_patch(). That should + * simplify livepatch's interface as well as its data structure lifetime + * management. + */ +struct klp_func_ext { + const char *old_name; + void *new_func; + unsigned long sympos; +}; + +struct klp_object_ext { + const char *name; + struct klp_func_ext *funcs; + struct klp_callbacks callbacks; + unsigned int nr_funcs; +}; + +#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index aceac94632c8..c6def4049b1a 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -67,4 +67,6 @@ struct unwind_hint { #define ANNOTYPE_REACHABLE 8 #define ANNOTYPE_NOCFI 9 +#define ANNOTYPE_DATA_SPECIAL 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/string.h b/tools/include/linux/string.h index 8499f509f03e..51ad3cf4fa82 100644 --- a/tools/include/linux/string.h +++ b/tools/include/linux/string.h @@ -44,6 +44,20 @@ static inline bool strstarts(const char *str, const char *prefix) return strncmp(str, prefix, strlen(prefix)) == 0; } +/* + * Checks if a string ends with another. + */ +static inline bool str_ends_with(const char *str, const char *substr) +{ + size_t len = strlen(str); + size_t sublen = strlen(substr); + + if (sublen > len) + return false; + + return !strcmp(str + len - sublen, substr); +} + extern char * __must_check skip_spaces(const char *); extern char *strim(char *); diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index e63a71d3c607..3cd5cf15e3c9 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -597,35 +597,66 @@ struct drm_set_version { int drm_dd_minor; }; -/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +/** + * struct drm_gem_close - Argument for &DRM_IOCTL_GEM_CLOSE ioctl. + * @handle: Handle of the object to be closed. + * @pad: Padding. + * + * Releases the handle to an mm object. + */ struct drm_gem_close { - /** Handle of the object to be closed. */ __u32 handle; __u32 pad; }; -/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +/** + * struct drm_gem_flink - Argument for &DRM_IOCTL_GEM_FLINK ioctl. + * @handle: Handle for the object being named. + * @name: Returned global name. + * + * Create a global name for an object, returning the name. + * + * Note that the name does not hold a reference; when the object + * is freed, the name goes away. + */ struct drm_gem_flink { - /** Handle for the object being named */ __u32 handle; - - /** Returned global name */ __u32 name; }; -/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +/** + * struct drm_gem_open - Argument for &DRM_IOCTL_GEM_OPEN ioctl. + * @name: Name of object being opened. + * @handle: Returned handle for the object. + * @size: Returned size of the object + * + * Open an object using the global name, returning a handle and the size. + * + * This handle (of course) holds a reference to the object, so the object + * will not go away until the handle is deleted. + */ struct drm_gem_open { - /** Name of object being opened */ __u32 name; - - /** Returned handle for the object */ __u32 handle; - - /** Returned size of the object */ __u64 size; }; /** + * struct drm_gem_change_handle - Argument for &DRM_IOCTL_GEM_CHANGE_HANDLE ioctl. + * @handle: The handle of a gem object. + * @new_handle: An available gem handle. + * + * This ioctl changes the handle of a GEM object to the specified one. + * The new handle must be unused. On success the old handle is closed + * and all further IOCTL should refer to the new handle only. + * Calls to DRM_IOCTL_PRIME_FD_TO_HANDLE will return the new handle. + */ +struct drm_gem_change_handle { + __u32 handle; + __u32 new_handle; +}; + +/** * DRM_CAP_DUMB_BUFFER * * If set to 1, the driver supports creating dumb buffers via the @@ -1309,6 +1340,14 @@ extern "C" { */ #define DRM_IOCTL_SET_CLIENT_NAME DRM_IOWR(0xD1, struct drm_set_client_name) +/** + * DRM_IOCTL_GEM_CHANGE_HANDLE - Move an object to a different handle + * + * Some applications (notably CRIU) need objects to have specific gem handles. + * This ioctl changes the object at one gem handle to use a new gem handle. + */ +#define DRM_IOCTL_GEM_CHANGE_HANDLE DRM_IOWR(0xD2, struct drm_gem_change_handle) + /* * Device specific ioctls should only be in their respective headers * The device specific ioctl range is from 0x40 to 0x9f. diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index f0f0d49d2544..52f6000ab020 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -962,6 +962,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 +#define KVM_CAP_GUEST_MEMFD_FLAGS 244 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1598,6 +1599,8 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) +#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) +#define GUEST_MEMFD_FLAG_INIT_SHARED (1ULL << 1) struct kvm_create_guest_memfd { __u64 size; diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h index 33c9b578b3b2..a25e38d1c874 100644 --- a/tools/include/uapi/linux/nsfs.h +++ b/tools/include/uapi/linux/nsfs.h @@ -53,6 +53,76 @@ enum init_ns_ino { TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, MNT_NS_INIT_INO = 0xEFFFFFF8U, +#ifdef __KERNEL__ + MNT_NS_ANON_INO = 0xEFFFFFF7U, +#endif }; +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; +}; + +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + +enum init_ns_id { + IPC_NS_INIT_ID = 1ULL, + UTS_NS_INIT_ID = 2ULL, + USER_NS_INIT_ID = 3ULL, + PID_NS_INIT_ID = 4ULL, + CGROUP_NS_INIT_ID = 5ULL, + TIME_NS_INIT_ID = 6ULL, + NET_NS_INIT_ID = 7ULL, + MNT_NS_INIT_ID = 8ULL, +#ifdef __KERNEL__ + NS_LAST_INIT_ID = MNT_NS_INIT_ID, +#endif +}; + +enum ns_type { + TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */ + MNT_NS = (1ULL << 17), /* CLONE_NEWNS */ + CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */ + UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */ + IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */ + USER_NS = (1ULL << 28), /* CLONE_NEWUSER */ + PID_NS = (1ULL << 29), /* CLONE_NEWPID */ + NET_NS = (1ULL << 30), /* CLONE_NEWNET */ +}; + +/** + * struct ns_id_req - namespace ID request structure + * @size: size of this structure + * @spare: reserved for future use + * @filter: filter mask + * @ns_id: last namespace id + * @user_ns_id: owning user namespace ID + * + * Structure for passing namespace ID and miscellaneous parameters to + * statns(2) and listns(2). + * + * For statns(2) @param represents the request mask. + * For listns(2) @param represents the last listed mount id (or zero). + */ +struct ns_id_req { + __u32 size; + __u32 spare; + __u64 ns_id; + struct /* listns */ { + __u32 ns_type; + __u32 spare2; + __u64 user_ns_id; + }; +}; + +/* + * Special @user_ns_id value that can be passed to listns() + */ +#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */ + +/* List of all ns_id_req versions. */ +#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */ + #endif /* __LINUX_NSFS_H */ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 78a362b80027..d292f96bc06f 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -463,7 +463,9 @@ struct perf_event_attr { inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ - __reserved_1 : 26; + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ + __reserved_1 : 24; union { __u32 wakeup_events; /* wake up every n events */ @@ -1239,6 +1241,22 @@ enum perf_event_type { */ PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + /* + * This user callchain capture was deferred until shortly before + * returning to user space. Previous samples would have kernel + * callchains only and they need to be stitched with this to make full + * callchains. + * + * struct { + * struct perf_event_header header; + * u64 cookie; + * u64 nr; + * u64 ips[nr]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CALLCHAIN_DEFERRED = 22, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1269,6 +1287,7 @@ enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, PERF_CONTEXT_GUEST = (__u64)-2048, PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index 80c028540656..d4e4e388e625 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -315,20 +315,20 @@ enum libbpf_tristate { ___param, sizeof(___param)); \ }) -extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, - __u32 len__sz, void *aux__prog) __weak __ksym; - -#define bpf_stream_printk(stream_id, fmt, args...) \ -({ \ - static const char ___fmt[] = fmt; \ - unsigned long long ___param[___bpf_narg(args)]; \ - \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - ___bpf_fill(___param, args); \ - _Pragma("GCC diagnostic pop") \ - \ - bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param), NULL);\ +extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, + __u32 len__sz, void *aux__prog) __weak __ksym; + +#define bpf_stream_printk(stream_id, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL); \ }) /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 58086b101057..aadeb3abcad8 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -861,6 +861,18 @@ class TypeIndexedArray(Type): return [f"{member} = {self.c_name};", f"{presence} = n_{self.c_name};"] + def free_needs_iter(self): + return self.sub_type == 'nest' + + def _free_lines(self, ri, var, ref): + lines = [] + if self.sub_type == 'nest': + lines += [ + f"for (i = 0; i < {var}->{ref}_count.{self.c_name}; i++)", + f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);', + ] + lines += f"free({var}->{ref}{self.c_name});", + return lines class TypeNestTypeValue(Type): def _complex_member_type(self, ri): diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore index 4faa4dd72f35..73d883128511 100644 --- a/tools/objtool/.gitignore +++ b/tools/objtool/.gitignore @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +arch/x86/lib/cpu-feature-names.c arch/x86/lib/inat-tables.c /objtool +feature +FEATURE-DUMP.objtool fixdep libsubcmd/ diff --git a/tools/objtool/Build b/tools/objtool/Build index a3cdf8af6635..9982e665d58d 100644 --- a/tools/objtool/Build +++ b/tools/objtool/Build @@ -8,8 +8,11 @@ objtool-y += builtin-check.o objtool-y += elf.o objtool-y += objtool.o -objtool-$(BUILD_ORC) += orc_gen.o -objtool-$(BUILD_ORC) += orc_dump.o +objtool-$(BUILD_DISAS) += disas.o +objtool-$(BUILD_DISAS) += trace.o + +objtool-$(BUILD_ORC) += orc_gen.o orc_dump.o +objtool-$(BUILD_KLP) += builtin-klp.o klp-diff.o klp-post-link.o objtool-y += libstring.o objtool-y += libctype.o diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 8c20361dd100..ad6e1ec706ce 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -2,6 +2,28 @@ include ../scripts/Makefile.include include ../scripts/Makefile.arch +ifeq ($(SRCARCH),x86) + BUILD_ORC := y + ARCH_HAS_KLP := y +endif + +ifeq ($(SRCARCH),loongarch) + BUILD_ORC := y +endif + +ifeq ($(ARCH_HAS_KLP),y) + HAVE_XXHASH = $(shell printf "$(pound)include <xxhash.h>\nXXH3_state_t *state;int main() {}" | \ + $(HOSTCC) -xc - -o /dev/null -lxxhash 2> /dev/null && echo y || echo n) + ifeq ($(HAVE_XXHASH),y) + BUILD_KLP := y + LIBXXHASH_CFLAGS := $(shell $(HOSTPKG_CONFIG) libxxhash --cflags 2>/dev/null) \ + -DBUILD_KLP + LIBXXHASH_LIBS := $(shell $(HOSTPKG_CONFIG) libxxhash --libs 2>/dev/null || echo -lxxhash) + endif +endif + +export BUILD_ORC BUILD_KLP + ifeq ($(srctree),) srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) @@ -23,6 +45,11 @@ LIBELF_LIBS := $(shell $(HOSTPKG_CONFIG) libelf --libs 2>/dev/null || echo -lel all: $(OBJTOOL) +WARNINGS := -Werror -Wall -Wextra -Wmissing-prototypes \ + -Wmissing-declarations -Wwrite-strings \ + -Wno-implicit-fallthrough -Wno-sign-compare \ + -Wno-unused-parameter + INCLUDES := -I$(srctree)/tools/include \ -I$(srctree)/tools/include/uapi \ -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ @@ -30,11 +57,11 @@ INCLUDES := -I$(srctree)/tools/include \ -I$(srctree)/tools/objtool/include \ -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include \ -I$(LIBSUBCMD_OUTPUT)/include -# Note, EXTRA_WARNINGS here was determined for CC and not HOSTCC, it -# is passed here to match a legacy behavior. -WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs -OBJTOOL_CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) -OBJTOOL_LDFLAGS := $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) + +OBJTOOL_CFLAGS := -std=gnu11 -fomit-frame-pointer -O2 -g $(WARNINGS) \ + $(INCLUDES) $(LIBELF_FLAGS) $(LIBXXHASH_CFLAGS) $(HOSTCFLAGS) + +OBJTOOL_LDFLAGS := $(LIBSUBCMD) $(LIBELF_LIBS) $(LIBXXHASH_LIBS) $(HOSTLDFLAGS) # Allow old libelf to be used: elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - 2>/dev/null | grep elf_getshdr) @@ -43,20 +70,32 @@ OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) # Always want host compilation. HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" -AWK = awk -MKDIR = mkdir +# +# To support disassembly, objtool needs libopcodes which is provided +# with libbdf (binutils-dev or binutils-devel package). +# +FEATURE_USER = .objtool +FEATURE_TESTS = libbfd disassembler-init-styled +FEATURE_DISPLAY = +include $(srctree)/tools/build/Makefile.feature + +ifeq ($(feature-disassembler-init-styled), 1) + OBJTOOL_CFLAGS += -DDISASM_INIT_STYLED +endif -BUILD_ORC := n +BUILD_DISAS := n -ifeq ($(SRCARCH),x86) - BUILD_ORC := y +ifeq ($(feature-libbfd),1) + BUILD_DISAS := y + OBJTOOL_CFLAGS += -DDISAS -DPACKAGE="objtool" + OBJTOOL_LDFLAGS += -lopcodes endif -ifeq ($(SRCARCH),loongarch) - BUILD_ORC := y -endif +export BUILD_DISAS + +AWK = awk +MKDIR = mkdir -export BUILD_ORC export srctree OUTPUT CFLAGS SRCARCH AWK include $(srctree)/tools/build/Makefile.include @@ -86,7 +125,10 @@ $(LIBSUBCMD)-clean: clean: $(LIBSUBCMD)-clean $(call QUIET_CLEAN, objtool) $(RM) $(OBJTOOL) $(Q)find $(OUTPUT) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete + $(Q)$(RM) $(OUTPUT)arch/x86/lib/cpu-feature-names.c $(OUTPUT)fixdep $(Q)$(RM) $(OUTPUT)arch/x86/lib/inat-tables.c $(OUTPUT)fixdep + $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.objtool + $(Q)$(RM) -r -- $(OUTPUT)feature FORCE: diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index 2e555c4060c5..6cd288150f49 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -1,13 +1,25 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <string.h> #include <objtool/check.h> +#include <objtool/disas.h> #include <objtool/warn.h> #include <asm/inst.h> #include <asm/orc_types.h> #include <linux/objtool_types.h> #include <arch/elf.h> -int arch_ftrace_match(char *name) +const char *arch_reg_name[CFI_NUM_REGS] = { + "zero", "ra", "tp", "sp", + "a0", "a1", "a2", "a3", + "a4", "a5", "a6", "a7", + "t0", "t1", "t2", "t3", + "t4", "t5", "t6", "t7", + "t8", "u0", "fp", "s0", + "s1", "s2", "s3", "s4", + "s5", "s6", "s7", "s8" +}; + +int arch_ftrace_match(const char *name) { return !strcmp(name, "_mcount"); } @@ -17,9 +29,9 @@ unsigned long arch_jump_destination(struct instruction *insn) return insn->offset + (insn->immediate << 2); } -unsigned long arch_dest_reloc_offset(int addend) +s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc) { - return addend; + return reloc_addend(reloc); } bool arch_pc_relative_reloc(struct reloc *reloc) @@ -414,3 +426,14 @@ unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *tabl return reloc->sym->offset + reloc_addend(reloc); } } + +#ifdef DISAS + +int arch_disas_info_init(struct disassemble_info *dinfo) +{ + return disas_info_init(dinfo, bfd_arch_loongarch, + bfd_mach_loongarch32, bfd_mach_loongarch64, + NULL); +} + +#endif /* DISAS */ diff --git a/tools/objtool/arch/loongarch/orc.c b/tools/objtool/arch/loongarch/orc.c index b58c5ff443c9..ffd3a3c858ae 100644 --- a/tools/objtool/arch/loongarch/orc.c +++ b/tools/objtool/arch/loongarch/orc.c @@ -5,7 +5,6 @@ #include <objtool/check.h> #include <objtool/orc.h> #include <objtool/warn.h> -#include <objtool/endianness.h> int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn) { diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c index a80b75f7b061..aba774109437 100644 --- a/tools/objtool/arch/loongarch/special.c +++ b/tools/objtool/arch/loongarch/special.c @@ -194,3 +194,8 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, return rodata_reloc; } + +const char *arch_cpu_feature_name(int feature_number) +{ + return NULL; +} diff --git a/tools/objtool/arch/powerpc/decode.c b/tools/objtool/arch/powerpc/decode.c index c851c51d4bd3..e534ac1123b3 100644 --- a/tools/objtool/arch/powerpc/decode.c +++ b/tools/objtool/arch/powerpc/decode.c @@ -3,20 +3,32 @@ #include <stdio.h> #include <stdlib.h> #include <objtool/check.h> +#include <objtool/disas.h> #include <objtool/elf.h> #include <objtool/arch.h> #include <objtool/warn.h> #include <objtool/builtin.h> -#include <objtool/endianness.h> -int arch_ftrace_match(char *name) +const char *arch_reg_name[CFI_NUM_REGS] = { + "r0", "sp", "r2", "r3", + "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", + "r12", "r13", "r14", "r15", + "r16", "r17", "r18", "r19", + "r20", "r21", "r22", "r23", + "r24", "r25", "r26", "r27", + "r28", "r29", "r30", "r31", + "ra" +}; + +int arch_ftrace_match(const char *name) { return !strcmp(name, "_mcount"); } -unsigned long arch_dest_reloc_offset(int addend) +s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc) { - return addend; + return reloc_addend(reloc); } bool arch_callee_saved_reg(unsigned char reg) @@ -128,3 +140,14 @@ unsigned int arch_reloc_size(struct reloc *reloc) return 8; } } + +#ifdef DISAS + +int arch_disas_info_init(struct disassemble_info *dinfo) +{ + return disas_info_init(dinfo, bfd_arch_powerpc, + bfd_mach_ppc, bfd_mach_ppc64, + NULL); +} + +#endif /* DISAS */ diff --git a/tools/objtool/arch/powerpc/special.c b/tools/objtool/arch/powerpc/special.c index 51610689abf7..8f9bf61ca089 100644 --- a/tools/objtool/arch/powerpc/special.c +++ b/tools/objtool/arch/powerpc/special.c @@ -18,3 +18,8 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, { exit(-1); } + +const char *arch_cpu_feature_name(int feature_number) +{ + return NULL; +} diff --git a/tools/objtool/arch/x86/Build b/tools/objtool/arch/x86/Build index 3dedb2fd8f3a..febee0b8ee0b 100644 --- a/tools/objtool/arch/x86/Build +++ b/tools/objtool/arch/x86/Build @@ -1,5 +1,5 @@ -objtool-y += special.o objtool-y += decode.o +objtool-y += special.o objtool-y += orc.o inat_tables_script = ../arch/x86/tools/gen-insn-attr-x86.awk @@ -12,3 +12,14 @@ $(OUTPUT)arch/x86/lib/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) $(OUTPUT)arch/x86/decode.o: $(OUTPUT)arch/x86/lib/inat-tables.c CFLAGS_decode.o += -I$(OUTPUT)arch/x86/lib + +cpu_features = ../arch/x86/include/asm/cpufeatures.h +cpu_features_script = ../arch/x86/tools/gen-cpu-feature-names-x86.awk + +$(OUTPUT)arch/x86/lib/cpu-feature-names.c: $(cpu_features_script) $(cpu_features) + $(call rule_mkdir) + $(Q)$(call echo-cmd,gen)$(AWK) -f $(cpu_features_script) $(cpu_features) > $@ + +$(OUTPUT)arch/x86/special.o: $(OUTPUT)arch/x86/lib/cpu-feature-names.c + +CFLAGS_special.o += -I$(OUTPUT)arch/x86/lib diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 0ad5cc70ecbe..f4af82508228 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -16,14 +16,22 @@ #include <asm/orc_types.h> #include <objtool/check.h> +#include <objtool/disas.h> #include <objtool/elf.h> #include <objtool/arch.h> #include <objtool/warn.h> -#include <objtool/endianness.h> #include <objtool/builtin.h> #include <arch/elf.h> -int arch_ftrace_match(char *name) +const char *arch_reg_name[CFI_NUM_REGS] = { + "rax", "rcx", "rdx", "rbx", + "rsp", "rbp", "rsi", "rdi", + "r8", "r9", "r10", "r11", + "r12", "r13", "r14", "r15", + "ra" +}; + +int arch_ftrace_match(const char *name) { return !strcmp(name, "__fentry__"); } @@ -68,9 +76,65 @@ bool arch_callee_saved_reg(unsigned char reg) } } -unsigned long arch_dest_reloc_offset(int addend) +/* Undo the effects of __pa_symbol() if necessary */ +static unsigned long phys_to_virt(unsigned long pa) +{ + s64 va = pa; + + if (va > 0) + va &= ~(0x80000000); + + return va; +} + +s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc) +{ + s64 addend = reloc_addend(reloc); + + if (arch_pc_relative_reloc(reloc)) + addend += insn->offset + insn->len - reloc_offset(reloc); + + return phys_to_virt(addend); +} + +static void scan_for_insn(struct section *sec, unsigned long offset, + unsigned long *insn_off, unsigned int *insn_len) { - return addend + 4; + unsigned long o = 0; + struct insn insn; + + while (1) { + + insn_decode(&insn, sec->data->d_buf + o, sec_size(sec) - o, + INSN_MODE_64); + + if (o + insn.length > offset) { + *insn_off = o; + *insn_len = insn.length; + return; + } + + o += insn.length; + } +} + +u64 arch_adjusted_addend(struct reloc *reloc) +{ + unsigned int type = reloc_type(reloc); + s64 addend = reloc_addend(reloc); + unsigned long insn_off; + unsigned int insn_len; + + if (type == R_X86_64_PLT32) + return addend + 4; + + if (type != R_X86_64_PC32 || !is_text_sec(reloc->sec->base)) + return addend; + + scan_for_insn(reloc->sec->base, reloc_offset(reloc), + &insn_off, &insn_len); + + return addend + insn_off + insn_len - reloc_offset(reloc); } unsigned long arch_jump_destination(struct instruction *insn) @@ -189,15 +253,6 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec op2 = ins.opcode.bytes[1]; op3 = ins.opcode.bytes[2]; - /* - * XXX hack, decoder is buggered and thinks 0xea is 7 bytes long. - */ - if (op1 == 0xea) { - insn->len = 1; - insn->type = INSN_BUG; - return 0; - } - if (ins.rex_prefix.nbytes) { rex = ins.rex_prefix.bytes[0]; rex_w = X86_REX_W(rex) >> 3; @@ -503,6 +558,12 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec break; case 0x90: + if (rex_b) /* XCHG %r8, %rax */ + break; + + if (prefix == 0xf3) /* REP NOP := PAUSE */ + break; + insn->type = INSN_NOP; break; @@ -556,13 +617,14 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec } else if (op2 == 0x0b || op2 == 0xb9) { - /* ud2 */ + /* ud2, ud1 */ insn->type = INSN_BUG; - } else if (op2 == 0x0d || op2 == 0x1f) { + } else if (op2 == 0x1f) { - /* nopl/nopw */ - insn->type = INSN_NOP; + /* 0f 1f /0 := NOPL */ + if (modrm_reg == 0) + insn->type = INSN_NOP; } else if (op2 == 0x1e) { @@ -692,6 +754,10 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec insn->type = INSN_SYSRET; break; + case 0xd6: /* udb */ + insn->type = INSN_BUG; + break; + case 0xe0: /* loopne */ case 0xe1: /* loope */ case 0xe2: /* loop */ @@ -892,3 +958,14 @@ bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) return false; } } + +#ifdef DISAS + +int arch_disas_info_init(struct disassemble_info *dinfo) +{ + return disas_info_init(dinfo, bfd_arch_i386, + bfd_mach_i386_i386, bfd_mach_x86_64, + "att"); +} + +#endif /* DISAS */ diff --git a/tools/objtool/arch/x86/orc.c b/tools/objtool/arch/x86/orc.c index 7176b9ec5b05..735e150ca6b7 100644 --- a/tools/objtool/arch/x86/orc.c +++ b/tools/objtool/arch/x86/orc.c @@ -5,7 +5,6 @@ #include <objtool/check.h> #include <objtool/orc.h> #include <objtool/warn.h> -#include <objtool/endianness.h> int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn) { diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c index 06ca4a2659a4..e817a3fff449 100644 --- a/tools/objtool/arch/x86/special.c +++ b/tools/objtool/arch/x86/special.c @@ -4,6 +4,10 @@ #include <objtool/special.h> #include <objtool/builtin.h> #include <objtool/warn.h> +#include <asm/cpufeatures.h> + +/* cpu feature name array generated from cpufeatures.h */ +#include "cpu-feature-names.c" void arch_handle_alternative(struct special_alt *alt) { @@ -89,7 +93,7 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, /* look for a relocation which references .rodata */ text_reloc = find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len); - if (!text_reloc || text_reloc->sym->type != STT_SECTION || + if (!text_reloc || !is_sec_sym(text_reloc->sym) || !text_reloc->sym->sec->rodata) return NULL; @@ -134,3 +138,9 @@ struct reloc *arch_find_switch_table(struct objtool_file *file, *table_size = 0; return rodata_reloc; } + +const char *arch_cpu_feature_name(int feature_number) +{ + return (feature_number < ARRAY_SIZE(cpu_feature_names)) ? + cpu_feature_names[feature_number] : NULL; +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 0f6b197cfcb0..b780df513715 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -73,35 +73,41 @@ static int parse_hacks(const struct option *opt, const char *str, int unset) static const struct option check_options[] = { OPT_GROUP("Actions:"), + OPT_BOOLEAN(0, "checksum", &opts.checksum, "generate per-function checksums"), + OPT_BOOLEAN(0, "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), + OPT_STRING_OPTARG('d', "disas", &opts.disas, "function-pattern", "disassemble functions", "*"), OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain bugs/limitations", parse_hacks), - OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), - OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"), - OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), - OPT_BOOLEAN(0, "orc", &opts.orc, "generate ORC metadata"), - OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), - OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"), - OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), - OPT_INTEGER(0, "prefix", &opts.prefix, "generate prefix symbols"), - OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"), - OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), - OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), - OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), - OPT_BOOLEAN(0 , "cfi", &opts.cfi, "annotate kernel control flow integrity (kCFI) function preambles"), - OPT_BOOLEAN(0 , "noabs", &opts.noabs, "reject absolute references in allocatable sections"), - OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), + OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), + OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"), + OPT_BOOLEAN(0, "noabs", &opts.noabs, "reject absolute references in allocatable sections"), + OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), + OPT_BOOLEAN(0, "orc", &opts.orc, "generate ORC metadata"), + OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), + OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"), + OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), + OPT_INTEGER(0, "prefix", &opts.prefix, "generate prefix symbols"), + OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"), + OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), + OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), + OPT_BOOLEAN('u', "uaccess", &opts.uaccess, "validate uaccess rules for SMAP"), + OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), - OPT_BOOLEAN(0, "backtrace", &opts.backtrace, "unwind on error"), - OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"), - OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"), - OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"), - OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"), - OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"), - OPT_STRING('o', "output", &opts.output, "file", "output file name"), - OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"), - OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"), - OPT_BOOLEAN('v', "verbose", &opts.verbose, "verbose warnings"), - OPT_BOOLEAN(0, "Werror", &opts.werror, "return error on warnings"), + OPT_BOOLEAN(0, "backtrace", &opts.backtrace, "unwind on error"), + OPT_BOOLEAN(0, "backup", &opts.backup, "create backup (.orig) file on warning/error"), + OPT_STRING(0, "debug-checksum", &opts.debug_checksum, "funcs", "enable checksum debug output"), + OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"), + OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"), + OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"), + OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"), + OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"), + OPT_STRING('o', "output", &opts.output, "file", "output file name"), + OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"), + OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"), + OPT_STRING(0, "trace", &opts.trace, "func", "trace function validation"), + OPT_BOOLEAN('v', "verbose", &opts.verbose, "verbose warnings"), + OPT_BOOLEAN(0, "werror", &opts.werror, "return error on warnings"), + OPT_BOOLEAN(0, "wide", &opts.wide, "wide output"), OPT_END(), }; @@ -159,7 +165,21 @@ static bool opts_valid(void) return false; } - if (opts.hack_jump_label || +#ifndef BUILD_KLP + if (opts.checksum) { + ERROR("--checksum not supported; install xxhash-devel/libxxhash-dev (version >= 0.8) and recompile"); + return false; + } +#endif + + if (opts.debug_checksum && !opts.checksum) { + ERROR("--debug-checksum requires --checksum"); + return false; + } + + if (opts.checksum || + opts.disas || + opts.hack_jump_label || opts.hack_noinstr || opts.ibt || opts.mcount || @@ -243,15 +263,12 @@ static void save_argv(int argc, const char **argv) ERROR_GLIBC("strdup(%s)", argv[i]); exit(1); } - }; + } } -void print_args(void) +int make_backup(void) { - char *backup = NULL; - - if (opts.output || opts.dryrun) - goto print; + char *backup; /* * Make a backup before kbuild deletes the file so the error @@ -260,33 +277,32 @@ void print_args(void) backup = malloc(strlen(objname) + strlen(ORIG_SUFFIX) + 1); if (!backup) { ERROR_GLIBC("malloc"); - goto print; + return 1; } strcpy(backup, objname); strcat(backup, ORIG_SUFFIX); - if (copy_file(objname, backup)) { - backup = NULL; - goto print; - } + if (copy_file(objname, backup)) + return 1; -print: /* - * Print the cmdline args to make it easier to recreate. If '--output' - * wasn't used, add it to the printed args with the backup as input. + * Print the cmdline args to make it easier to recreate. */ + fprintf(stderr, "%s", orig_argv[0]); for (int i = 1; i < orig_argc; i++) { char *arg = orig_argv[i]; - if (backup && !strcmp(arg, objname)) + /* Modify the printed args to use the backup */ + if (!opts.output && !strcmp(arg, objname)) fprintf(stderr, " %s -o %s", backup, objname); else fprintf(stderr, " %s", arg); } fprintf(stderr, "\n"); + return 0; } int objtool_run(int argc, const char **argv) @@ -332,5 +348,5 @@ int objtool_run(int argc, const char **argv) if (!opts.dryrun && file->elf->changed && elf_write(file->elf)) return 1; - return 0; + return elf_close(file->elf); } diff --git a/tools/objtool/builtin-klp.c b/tools/objtool/builtin-klp.c new file mode 100644 index 000000000000..56d5a5b92f72 --- /dev/null +++ b/tools/objtool/builtin-klp.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <subcmd/parse-options.h> +#include <string.h> +#include <stdlib.h> +#include <objtool/builtin.h> +#include <objtool/objtool.h> +#include <objtool/klp.h> + +struct subcmd { + const char *name; + const char *description; + int (*fn)(int, const char **); +}; + +static struct subcmd subcmds[] = { + { "diff", "Generate binary diff of two object files", cmd_klp_diff, }, + { "post-link", "Finalize klp symbols/relocs after module linking", cmd_klp_post_link, }, +}; + +static void cmd_klp_usage(void) +{ + fprintf(stderr, "usage: objtool klp <subcommand> [<options>]\n\n"); + fprintf(stderr, "Subcommands:\n"); + + for (int i = 0; i < ARRAY_SIZE(subcmds); i++) { + struct subcmd *cmd = &subcmds[i]; + + fprintf(stderr, " %s\t%s\n", cmd->name, cmd->description); + } + + exit(1); +} + +int cmd_klp(int argc, const char **argv) +{ + argc--; + argv++; + + if (!argc) + cmd_klp_usage(); + + if (argc) { + for (int i = 0; i < ARRAY_SIZE(subcmds); i++) { + struct subcmd *cmd = &subcmds[i]; + + if (!strcmp(cmd->name, argv[0])) + return cmd->fn(argc, argv); + } + } + + cmd_klp_usage(); + return 0; +} diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 9004fbc06769..9ec0e07cce90 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3,6 +3,8 @@ * Copyright (C) 2015-2017 Josh Poimboeuf <jpoimboe@redhat.com> */ +#define _GNU_SOURCE /* memmem() */ +#include <fnmatch.h> #include <string.h> #include <stdlib.h> #include <inttypes.h> @@ -11,10 +13,13 @@ #include <objtool/builtin.h> #include <objtool/cfi.h> #include <objtool/arch.h> +#include <objtool/disas.h> #include <objtool/check.h> #include <objtool/special.h> +#include <objtool/trace.h> #include <objtool/warn.h> -#include <objtool/endianness.h> +#include <objtool/checksum.h> +#include <objtool/util.h> #include <linux/objtool_types.h> #include <linux/hashtable.h> @@ -22,11 +27,6 @@ #include <linux/static_call_types.h> #include <linux/string.h> -struct alternative { - struct alternative *next; - struct instruction *insn; -}; - static unsigned long nr_cfi, nr_cfi_reused, nr_cfi_cache; static struct cfi_init_state initial_func_cfi; @@ -34,6 +34,10 @@ static struct cfi_state init_cfi; static struct cfi_state func_cfi; static struct cfi_state force_undefined_cfi; +struct disas_context *objtool_disas_ctx; + +size_t sym_name_max_len; + struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset) { @@ -106,7 +110,7 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, #define for_each_insn(file, insn) \ for (struct section *__sec, *__fake = (struct section *)1; \ __fake; __fake = NULL) \ - for_each_sec(file, __sec) \ + for_each_sec(file->elf, __sec) \ sec_for_each_insn(file, __sec, insn) #define func_for_each_insn(file, func, insn) \ @@ -131,15 +135,6 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, for (insn = next_insn_same_sec(file, insn); insn; \ insn = next_insn_same_sec(file, insn)) -static inline struct symbol *insn_call_dest(struct instruction *insn) -{ - if (insn->type == INSN_JUMP_DYNAMIC || - insn->type == INSN_CALL_DYNAMIC) - return NULL; - - return insn->_call_dest; -} - static inline struct reloc *insn_jump_table(struct instruction *insn) { if (insn->type == INSN_JUMP_DYNAMIC || @@ -186,20 +181,6 @@ static bool is_sibling_call(struct instruction *insn) } /* - * Checks if a string ends with another. - */ -static bool str_ends_with(const char *s, const char *sub) -{ - const int slen = strlen(s); - const int sublen = strlen(sub); - - if (sublen > slen) - return 0; - - return !memcmp(s + slen - sublen, sub, sublen); -} - -/* * Checks if a function is a Rust "noreturn" one. */ static bool is_rust_noreturn(const struct symbol *func) @@ -262,7 +243,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, if (!func) return false; - if (func->bind == STB_GLOBAL || func->bind == STB_WEAK) { + if (!is_local_sym(func)) { if (is_rust_noreturn(func)) return true; @@ -271,7 +252,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, return true; } - if (func->bind == STB_WEAK) + if (is_weak_sym(func)) return false; if (!func->len) @@ -431,14 +412,13 @@ static int decode_instructions(struct objtool_file *file) struct symbol *func; unsigned long offset; struct instruction *insn; - int ret; - for_each_sec(file, sec) { + for_each_sec(file->elf, sec) { struct instruction *insns = NULL; u8 prev_len = 0; u8 idx = 0; - if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + if (!is_text_sec(sec)) continue; if (strcmp(sec->name, ".altinstr_replacement") && @@ -461,9 +441,9 @@ static int decode_instructions(struct objtool_file *file) if (!strcmp(sec->name, ".init.text") && !opts.module) sec->init = true; - for (offset = 0; offset < sec->sh.sh_size; offset += insn->len) { + for (offset = 0; offset < sec_size(sec); offset += insn->len) { if (!insns || idx == INSN_CHUNK_MAX) { - insns = calloc(sizeof(*insn), INSN_CHUNK_SIZE); + insns = calloc(INSN_CHUNK_SIZE, sizeof(*insn)); if (!insns) { ERROR_GLIBC("calloc"); return -1; @@ -480,11 +460,8 @@ static int decode_instructions(struct objtool_file *file) insn->offset = offset; insn->prev_len = prev_len; - ret = arch_decode_instruction(file, sec, offset, - sec->sh.sh_size - offset, - insn); - if (ret) - return ret; + if (arch_decode_instruction(file, sec, offset, sec_size(sec) - offset, insn)) + return -1; prev_len = insn->len; @@ -501,12 +478,12 @@ static int decode_instructions(struct objtool_file *file) } sec_for_each_sym(sec, func) { - if (func->type != STT_NOTYPE && func->type != STT_FUNC) + if (!is_notype_sym(func) && !is_func_sym(func)) continue; - if (func->offset == sec->sh.sh_size) { + if (func->offset == sec_size(sec)) { /* Heuristic: likely an "end" symbol */ - if (func->type == STT_NOTYPE) + if (is_notype_sym(func)) continue; ERROR("%s(): STT_FUNC at end of section", func->name); return -1; @@ -522,7 +499,7 @@ static int decode_instructions(struct objtool_file *file) sym_for_each_insn(file, func, insn) { insn->sym = func; - if (func->type == STT_FUNC && + if (is_func_sym(func) && insn->type == INSN_ENDBR && list_empty(&insn->call_node)) { if (insn->offset == func->offset) { @@ -566,7 +543,7 @@ static int add_pv_ops(struct objtool_file *file, const char *symname) idx = (reloc_offset(reloc) - sym->offset) / sizeof(unsigned long); func = reloc->sym; - if (func->type == STT_SECTION) + if (is_sec_sym(func)) func = find_symbol_by_offset(reloc->sym->sec, reloc_addend(reloc)); if (!func) { @@ -600,7 +577,7 @@ static int init_pv_ops(struct objtool_file *file) }; const char *pv_ops; struct symbol *sym; - int idx, nr, ret; + int idx, nr; if (!opts.noinstr) return 0; @@ -612,7 +589,7 @@ static int init_pv_ops(struct objtool_file *file) return 0; nr = sym->len / sizeof(unsigned long); - file->pv_ops = calloc(sizeof(struct pv_state), nr); + file->pv_ops = calloc(nr, sizeof(struct pv_state)); if (!file->pv_ops) { ERROR_GLIBC("calloc"); return -1; @@ -622,14 +599,27 @@ static int init_pv_ops(struct objtool_file *file) INIT_LIST_HEAD(&file->pv_ops[idx].targets); for (idx = 0; (pv_ops = pv_ops_tables[idx]); idx++) { - ret = add_pv_ops(file, pv_ops); - if (ret) - return ret; + if (add_pv_ops(file, pv_ops)) + return -1; } return 0; } +static bool is_livepatch_module(struct objtool_file *file) +{ + struct section *sec; + + if (!opts.module) + return false; + + sec = find_section_by_name(file->elf, ".modinfo"); + if (!sec) + return false; + + return memmem(sec->data->d_buf, sec_size(sec), "\0livepatch=Y", 12); +} + static int create_static_call_sections(struct objtool_file *file) { struct static_call_site *site; @@ -641,8 +631,14 @@ static int create_static_call_sections(struct objtool_file *file) sec = find_section_by_name(file->elf, ".static_call_sites"); if (sec) { - INIT_LIST_HEAD(&file->static_call_list); - WARN("file already has .static_call_sites section, skipping"); + /* + * Livepatch modules may have already extracted the static call + * site entries to take advantage of vmlinux static call + * privileges. + */ + if (!file->klp) + WARN("file already has .static_call_sites section, skipping"); + return 0; } @@ -686,7 +682,7 @@ static int create_static_call_sections(struct objtool_file *file) key_sym = find_symbol_by_name(file->elf, tmp); if (!key_sym) { - if (!opts.module) { + if (!opts.module || file->klp) { ERROR("static_call: can't find static_call_key symbol: %s", tmp); return -1; } @@ -829,7 +825,7 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) struct symbol *sym = insn->sym; *site = 0; - if (opts.module && sym && sym->type == STT_FUNC && + if (opts.module && sym && is_func_sym(sym) && insn->offset == sym->offset && (!strcmp(sym->name, "init_module") || !strcmp(sym->name, "cleanup_module"))) { @@ -857,14 +853,13 @@ static int create_cfi_sections(struct objtool_file *file) sec = find_section_by_name(file->elf, ".cfi_sites"); if (sec) { - INIT_LIST_HEAD(&file->call_list); WARN("file already has .cfi_sites section, skipping"); return 0; } idx = 0; - for_each_sym(file, sym) { - if (sym->type != STT_FUNC) + for_each_sym(file->elf, sym) { + if (!is_func_sym(sym)) continue; if (strncmp(sym->name, "__cfi_", 6)) @@ -879,8 +874,8 @@ static int create_cfi_sections(struct objtool_file *file) return -1; idx = 0; - for_each_sym(file, sym) { - if (sym->type != STT_FUNC) + for_each_sym(file->elf, sym) { + if (!is_func_sym(sym)) continue; if (strncmp(sym->name, "__cfi_", 6)) @@ -906,8 +901,13 @@ static int create_mcount_loc_sections(struct objtool_file *file) sec = find_section_by_name(file->elf, "__mcount_loc"); if (sec) { - INIT_LIST_HEAD(&file->mcount_loc_list); - WARN("file already has __mcount_loc section, skipping"); + /* + * Livepatch modules have already extracted their __mcount_loc + * entries to cover the !CONFIG_FTRACE_MCOUNT_USE_OBJTOOL case. + */ + if (!file->klp) + WARN("file already has __mcount_loc section, skipping"); + return 0; } @@ -951,7 +951,6 @@ static int create_direct_call_sections(struct objtool_file *file) sec = find_section_by_name(file->elf, ".call_sites"); if (sec) { - INIT_LIST_HEAD(&file->call_list); WARN("file already has .call_sites section, skipping"); return 0; } @@ -982,6 +981,59 @@ static int create_direct_call_sections(struct objtool_file *file) return 0; } +#ifdef BUILD_KLP +static int create_sym_checksum_section(struct objtool_file *file) +{ + struct section *sec; + struct symbol *sym; + unsigned int idx = 0; + struct sym_checksum *checksum; + size_t entsize = sizeof(struct sym_checksum); + + sec = find_section_by_name(file->elf, ".discard.sym_checksum"); + if (sec) { + if (!opts.dryrun) + WARN("file already has .discard.sym_checksum section, skipping"); + + return 0; + } + + for_each_sym(file->elf, sym) + if (sym->csum.checksum) + idx++; + + if (!idx) + return 0; + + sec = elf_create_section_pair(file->elf, ".discard.sym_checksum", entsize, + idx, idx); + if (!sec) + return -1; + + idx = 0; + for_each_sym(file->elf, sym) { + if (!sym->csum.checksum) + continue; + + if (!elf_init_reloc(file->elf, sec->rsec, idx, idx * entsize, + sym, 0, R_TEXT64)) + return -1; + + checksum = (struct sym_checksum *)sec->data->d_buf + idx; + checksum->addr = 0; /* reloc */ + checksum->checksum = sym->csum.checksum; + + mark_sec_changed(file->elf, sec, true); + + idx++; + } + + return 0; +} +#else +static int create_sym_checksum_section(struct objtool_file *file) { return -EINVAL; } +#endif + /* * Warnings shouldn't be reported for ignored functions. */ @@ -1433,9 +1485,14 @@ static void add_return_call(struct objtool_file *file, struct instruction *insn, } static bool is_first_func_insn(struct objtool_file *file, - struct instruction *insn, struct symbol *sym) + struct instruction *insn) { - if (insn->offset == sym->offset) + struct symbol *func = insn_func(insn); + + if (!func) + return false; + + if (insn->offset == func->offset) return true; /* Allow direct CALL/JMP past ENDBR */ @@ -1443,7 +1500,7 @@ static bool is_first_func_insn(struct objtool_file *file, struct instruction *prev = prev_insn_same_sym(file, insn); if (prev && prev->type == INSN_ENDBR && - insn->offset == sym->offset + prev->len) + insn->offset == func->offset + prev->len) return true; } @@ -1451,44 +1508,22 @@ static bool is_first_func_insn(struct objtool_file *file, } /* - * A sibling call is a tail-call to another symbol -- to differentiate from a - * recursive tail-call which is to the same symbol. - */ -static bool jump_is_sibling_call(struct objtool_file *file, - struct instruction *from, struct instruction *to) -{ - struct symbol *fs = from->sym; - struct symbol *ts = to->sym; - - /* Not a sibling call if from/to a symbol hole */ - if (!fs || !ts) - return false; - - /* Not a sibling call if not targeting the start of a symbol. */ - if (!is_first_func_insn(file, to, ts)) - return false; - - /* Disallow sibling calls into STT_NOTYPE */ - if (ts->type == STT_NOTYPE) - return false; - - /* Must not be self to be a sibling */ - return fs->pfunc != ts->pfunc; -} - -/* * Find the destination instructions for all jumps. */ static int add_jump_destinations(struct objtool_file *file) { - struct instruction *insn, *jump_dest; + struct instruction *insn; struct reloc *reloc; - struct section *dest_sec; - unsigned long dest_off; - int ret; for_each_insn(file, insn) { struct symbol *func = insn_func(insn); + struct instruction *dest_insn; + struct section *dest_sec; + struct symbol *dest_sym; + unsigned long dest_off; + + if (!is_static_jump(insn)) + continue; if (insn->jump_dest) { /* @@ -1497,53 +1532,53 @@ static int add_jump_destinations(struct objtool_file *file) */ continue; } - if (!is_static_jump(insn)) - continue; reloc = insn_reloc(file, insn); if (!reloc) { dest_sec = insn->sec; dest_off = arch_jump_destination(insn); - } else if (reloc->sym->type == STT_SECTION) { - dest_sec = reloc->sym->sec; - dest_off = arch_dest_reloc_offset(reloc_addend(reloc)); - } else if (reloc->sym->retpoline_thunk) { - ret = add_retpoline_call(file, insn); - if (ret) - return ret; - continue; - } else if (reloc->sym->return_thunk) { - add_return_call(file, insn, true); - continue; - } else if (func) { - /* - * External sibling call or internal sibling call with - * STT_FUNC reloc. - */ - ret = add_call_dest(file, insn, reloc->sym, true); - if (ret) - return ret; - continue; - } else if (reloc->sym->sec->idx) { - dest_sec = reloc->sym->sec; - dest_off = reloc->sym->sym.st_value + - arch_dest_reloc_offset(reloc_addend(reloc)); + dest_sym = dest_sec->sym; } else { - /* non-func asm code jumping to another file */ - continue; + dest_sym = reloc->sym; + if (is_undef_sym(dest_sym)) { + if (dest_sym->retpoline_thunk) { + if (add_retpoline_call(file, insn)) + return -1; + continue; + } + + if (dest_sym->return_thunk) { + add_return_call(file, insn, true); + continue; + } + + /* External symbol */ + if (func) { + /* External sibling call */ + if (add_call_dest(file, insn, dest_sym, true)) + return -1; + continue; + } + + /* Non-func asm code jumping to external symbol */ + continue; + } + + dest_sec = dest_sym->sec; + dest_off = dest_sym->offset + arch_insn_adjusted_addend(insn, reloc); } - jump_dest = find_insn(file, dest_sec, dest_off); - if (!jump_dest) { + dest_insn = find_insn(file, dest_sec, dest_off); + if (!dest_insn) { struct symbol *sym = find_symbol_by_offset(dest_sec, dest_off); /* - * This is a special case for retbleed_untrain_ret(). - * It jumps to __x86_return_thunk(), but objtool - * can't find the thunk's starting RET - * instruction, because the RET is also in the - * middle of another instruction. Objtool only - * knows about the outer instruction. + * retbleed_untrain_ret() jumps to + * __x86_return_thunk(), but objtool can't find + * the thunk's starting RET instruction, + * because the RET is also in the middle of + * another instruction. Objtool only knows + * about the outer instruction. */ if (sym && sym->embedded_insn) { add_return_call(file, insn, false); @@ -1551,76 +1586,52 @@ static int add_jump_destinations(struct objtool_file *file) } /* - * GCOV/KCOV dead code can jump to the end of the - * function/section. + * GCOV/KCOV dead code can jump to the end of + * the function/section. */ if (file->ignore_unreachables && func && dest_sec == insn->sec && dest_off == func->offset + func->len) continue; - ERROR_INSN(insn, "can't find jump dest instruction at %s+0x%lx", - dest_sec->name, dest_off); + ERROR_INSN(insn, "can't find jump dest instruction at %s", + offstr(dest_sec, dest_off)); return -1; } - /* - * An intra-TU jump in retpoline.o might not have a relocation - * for its jump dest, in which case the above - * add_{retpoline,return}_call() didn't happen. - */ - if (jump_dest->sym && jump_dest->offset == jump_dest->sym->offset) { - if (jump_dest->sym->retpoline_thunk) { - ret = add_retpoline_call(file, insn); - if (ret) - return ret; - continue; - } - if (jump_dest->sym->return_thunk) { - add_return_call(file, insn, true); - continue; - } + if (!dest_sym || is_sec_sym(dest_sym)) { + dest_sym = dest_insn->sym; + if (!dest_sym) + goto set_jump_dest; } - /* - * Cross-function jump. - */ - if (func && insn_func(jump_dest) && func != insn_func(jump_dest)) { + if (dest_sym->retpoline_thunk && dest_insn->offset == dest_sym->offset) { + if (add_retpoline_call(file, insn)) + return -1; + continue; + } - /* - * For GCC 8+, create parent/child links for any cold - * subfunctions. This is _mostly_ redundant with a - * similar initialization in read_symbols(). - * - * If a function has aliases, we want the *first* such - * function in the symbol table to be the subfunction's - * parent. In that case we overwrite the - * initialization done in read_symbols(). - * - * However this code can't completely replace the - * read_symbols() code because this doesn't detect the - * case where the parent function's only reference to a - * subfunction is through a jump table. - */ - if (!strstr(func->name, ".cold") && - strstr(insn_func(jump_dest)->name, ".cold")) { - func->cfunc = insn_func(jump_dest); - insn_func(jump_dest)->pfunc = func; - } + if (dest_sym->return_thunk && dest_insn->offset == dest_sym->offset) { + add_return_call(file, insn, true); + continue; } - if (jump_is_sibling_call(file, insn, jump_dest)) { - /* - * Internal sibling call without reloc or with - * STT_SECTION reloc. - */ - ret = add_call_dest(file, insn, insn_func(jump_dest), true); - if (ret) - return ret; + if (!insn->sym || insn->sym->pfunc == dest_sym->pfunc) + goto set_jump_dest; + + /* + * Internal cross-function jump. + */ + + if (is_first_func_insn(file, dest_insn)) { + /* Internal sibling call */ + if (add_call_dest(file, insn, dest_sym, true)) + return -1; continue; } - insn->jump_dest = jump_dest; +set_jump_dest: + insn->jump_dest = dest_insn; } return 0; @@ -1646,7 +1657,6 @@ static int add_call_destinations(struct objtool_file *file) unsigned long dest_off; struct symbol *dest; struct reloc *reloc; - int ret; for_each_insn(file, insn) { struct symbol *func = insn_func(insn); @@ -1658,9 +1668,8 @@ static int add_call_destinations(struct objtool_file *file) dest_off = arch_jump_destination(insn); dest = find_call_destination(insn->sec, dest_off); - ret = add_call_dest(file, insn, dest, false); - if (ret) - return ret; + if (add_call_dest(file, insn, dest, false)) + return -1; if (func && func->ignore) continue; @@ -1670,13 +1679,13 @@ static int add_call_destinations(struct objtool_file *file) return -1; } - if (func && insn_call_dest(insn)->type != STT_FUNC) { + if (func && !is_func_sym(insn_call_dest(insn))) { ERROR_INSN(insn, "unsupported call to non-function"); return -1; } - } else if (reloc->sym->type == STT_SECTION) { - dest_off = arch_dest_reloc_offset(reloc_addend(reloc)); + } else if (is_sec_sym(reloc->sym)) { + dest_off = arch_insn_adjusted_addend(insn, reloc); dest = find_call_destination(reloc->sym->sec, dest_off); if (!dest) { ERROR_INSN(insn, "can't find call dest symbol at %s+0x%lx", @@ -1684,19 +1693,16 @@ static int add_call_destinations(struct objtool_file *file) return -1; } - ret = add_call_dest(file, insn, dest, false); - if (ret) - return ret; + if (add_call_dest(file, insn, dest, false)) + return -1; } else if (reloc->sym->retpoline_thunk) { - ret = add_retpoline_call(file, insn); - if (ret) - return ret; + if (add_retpoline_call(file, insn)) + return -1; } else { - ret = add_call_dest(file, insn, reloc->sym, false); - if (ret) - return ret; + if (add_call_dest(file, insn, reloc->sym, false)) + return -1; } } @@ -1745,6 +1751,7 @@ static int handle_group_alt(struct objtool_file *file, orig_alt_group->last_insn = last_orig_insn; orig_alt_group->nop = NULL; orig_alt_group->ignore = orig_insn->ignore_alts; + orig_alt_group->feature = 0; } else { if (orig_alt_group->last_insn->offset + orig_alt_group->last_insn->len - orig_alt_group->first_insn->offset != special_alt->orig_len) { @@ -1784,6 +1791,7 @@ static int handle_group_alt(struct objtool_file *file, nop->type = INSN_NOP; nop->sym = orig_insn->sym; nop->alt_group = new_alt_group; + nop->fake = 1; } if (!special_alt->new_len) { @@ -1848,6 +1856,7 @@ end: new_alt_group->nop = nop; new_alt_group->ignore = (*new_insn)->ignore_alts; new_alt_group->cfi = orig_alt_group->cfi; + new_alt_group->feature = special_alt->feature; return 0; } @@ -1912,8 +1921,9 @@ static int add_special_section_alts(struct objtool_file *file) struct list_head special_alts; struct instruction *orig_insn, *new_insn; struct special_alt *special_alt, *tmp; + enum alternative_type alt_type; struct alternative *alt; - int ret; + struct alternative *a; if (special_get_alts(file->elf, &special_alts)) return -1; @@ -1945,16 +1955,18 @@ static int add_special_section_alts(struct objtool_file *file) continue; } - ret = handle_group_alt(file, special_alt, orig_insn, - &new_insn); - if (ret) - return ret; + if (handle_group_alt(file, special_alt, orig_insn, &new_insn)) + return -1; + + alt_type = ALT_TYPE_INSTRUCTIONS; } else if (special_alt->jump_or_nop) { - ret = handle_jump_alt(file, special_alt, orig_insn, - &new_insn); - if (ret) - return ret; + if (handle_jump_alt(file, special_alt, orig_insn, &new_insn)) + return -1; + + alt_type = ALT_TYPE_JUMP_TABLE; + } else { + alt_type = ALT_TYPE_EX_TABLE; } alt = calloc(1, sizeof(*alt)); @@ -1964,8 +1976,20 @@ static int add_special_section_alts(struct objtool_file *file) } alt->insn = new_insn; - alt->next = orig_insn->alts; - orig_insn->alts = alt; + alt->type = alt_type; + alt->next = NULL; + + /* + * Store alternatives in the same order they have been + * defined. + */ + if (!orig_insn->alts) { + orig_insn->alts = alt; + } else { + for (a = orig_insn->alts; a->next; a = a->next) + ; + a->next = alt; + } list_del(&special_alt->list); free(special_alt); @@ -2142,15 +2166,13 @@ static int add_func_jump_tables(struct objtool_file *file, struct symbol *func) { struct instruction *insn; - int ret; func_for_each_insn(file, func, insn) { if (!insn_jump_table(insn)) continue; - ret = add_jump_table(file, insn); - if (ret) - return ret; + if (add_jump_table(file, insn)) + return -1; } return 0; @@ -2164,19 +2186,17 @@ static int add_func_jump_tables(struct objtool_file *file, static int add_jump_table_alts(struct objtool_file *file) { struct symbol *func; - int ret; if (!file->rodata) return 0; - for_each_sym(file, func) { - if (func->type != STT_FUNC) + for_each_sym(file->elf, func) { + if (!is_func_sym(func) || func->alias != func) continue; mark_func_jump_tables(file, func); - ret = add_func_jump_tables(file, func); - if (ret) - return ret; + if (add_func_jump_tables(file, func)) + return -1; } return 0; @@ -2210,14 +2230,14 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - if (sec->sh.sh_size % sizeof(struct unwind_hint)) { + if (sec_size(sec) % sizeof(struct unwind_hint)) { ERROR("struct unwind_hint size mismatch"); return -1; } file->hints = true; - for (i = 0; i < sec->sh.sh_size / sizeof(struct unwind_hint); i++) { + for (i = 0; i < sec_size(sec) / sizeof(struct unwind_hint); i++) { hint = (struct unwind_hint *)sec->data->d_buf + i; reloc = find_reloc_by_dest(file->elf, sec, i * sizeof(*hint)); @@ -2226,14 +2246,7 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - if (reloc->sym->type == STT_SECTION) { - offset = reloc_addend(reloc); - } else if (reloc->sym->local_label) { - offset = reloc->sym->offset; - } else { - ERROR("unexpected relocation symbol type in %s", sec->rsec->name); - return -1; - } + offset = reloc->sym->offset + reloc_addend(reloc); insn = find_insn(file, reloc->sym->sec, offset); if (!insn) { @@ -2262,7 +2275,7 @@ static int read_unwind_hints(struct objtool_file *file) if (hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) { struct symbol *sym = find_symbol_by_offset(insn->sec, insn->offset); - if (sym && sym->bind == STB_GLOBAL) { + if (sym && is_global_sym(sym)) { if (opts.ibt && insn->type != INSN_ENDBR && !insn->noendbr) { ERROR_INSN(insn, "UNWIND_HINT_IRET_REGS without ENDBR"); return -1; @@ -2300,7 +2313,7 @@ static int read_annotate(struct objtool_file *file, struct instruction *insn; struct reloc *reloc; uint64_t offset; - int type, ret; + int type; sec = find_section_by_name(file->elf, ".discard.annotate_insn"); if (!sec) @@ -2318,10 +2331,13 @@ static int read_annotate(struct objtool_file *file, sec->sh.sh_entsize = 8; } - for_each_reloc(sec->rsec, reloc) { - type = *(u32 *)(sec->data->d_buf + (reloc_idx(reloc) * sec->sh.sh_entsize) + 4); - type = bswap_if_needed(file->elf, type); + if (sec_num_entries(sec) != sec_num_entries(sec->rsec)) { + ERROR("bad .discard.annotate_insn section: missing relocs"); + return -1; + } + for_each_reloc(sec->rsec, reloc) { + type = annotype(file->elf, sec, reloc); offset = reloc->sym->offset + reloc_addend(reloc); insn = find_insn(file, reloc->sym->sec, offset); @@ -2330,9 +2346,8 @@ static int read_annotate(struct objtool_file *file, return -1; } - ret = func(file, type, insn); - if (ret < 0) - return ret; + if (func(file, type, insn)) + return -1; } return 0; @@ -2471,12 +2486,13 @@ static bool is_profiling_func(const char *name) static int classify_symbols(struct objtool_file *file) { struct symbol *func; + size_t len; - for_each_sym(file, func) { - if (func->type == STT_NOTYPE && strstarts(func->name, ".L")) + for_each_sym(file->elf, func) { + if (is_notype_sym(func) && strstarts(func->name, ".L")) func->local_label = true; - if (func->bind != STB_GLOBAL) + if (!is_global_sym(func)) continue; if (!strncmp(func->name, STATIC_CALL_TRAMP_PREFIX_STR, @@ -2497,6 +2513,10 @@ static int classify_symbols(struct objtool_file *file) if (is_profiling_func(func->name)) func->profiling_func = true; + + len = strlen(func->name); + if (len > sym_name_max_len) + sym_name_max_len = len; } return 0; @@ -2517,7 +2537,7 @@ static void mark_rodata(struct objtool_file *file) * * .rodata.str1.* sections are ignored; they don't contain jump tables. */ - for_each_sec(file, sec) { + for_each_sec(file->elf, sec) { if ((!strncmp(sec->name, ".rodata", 7) && !strstr(sec->name, ".str1.")) || !strncmp(sec->name, ".data.rel.ro", 12)) { @@ -2529,78 +2549,115 @@ static void mark_rodata(struct objtool_file *file) file->rodata = found; } +static void mark_holes(struct objtool_file *file) +{ + struct instruction *insn; + bool in_hole = false; + + if (!opts.link) + return; + + /* + * Whole archive runs might encounter dead code from weak symbols. + * This is where the linker will have dropped the weak symbol in + * favour of a regular symbol, but leaves the code in place. + */ + for_each_insn(file, insn) { + if (insn->sym || !find_symbol_hole_containing(insn->sec, insn->offset)) { + in_hole = false; + continue; + } + + /* Skip function padding and pfx code */ + if (!in_hole && insn->type == INSN_NOP) + continue; + + in_hole = true; + insn->hole = 1; + + /* + * If this hole jumps to a .cold function, mark it ignore. + */ + if (insn->jump_dest) { + struct symbol *dest_func = insn_func(insn->jump_dest); + + if (dest_func && dest_func->cold) + dest_func->ignore = true; + } + } +} + +static bool validate_branch_enabled(void) +{ + return opts.stackval || + opts.orc || + opts.uaccess || + opts.checksum; +} + static int decode_sections(struct objtool_file *file) { - int ret; + file->klp = is_livepatch_module(file); mark_rodata(file); - ret = init_pv_ops(file); - if (ret) - return ret; + if (init_pv_ops(file)) + return -1; /* * Must be before add_{jump_call}_destination. */ - ret = classify_symbols(file); - if (ret) - return ret; + if (classify_symbols(file)) + return -1; - ret = decode_instructions(file); - if (ret) - return ret; + if (decode_instructions(file)) + return -1; - ret = add_ignores(file); - if (ret) - return ret; + if (add_ignores(file)) + return -1; add_uaccess_safe(file); - ret = read_annotate(file, __annotate_early); - if (ret) - return ret; + if (read_annotate(file, __annotate_early)) + return -1; /* * Must be before add_jump_destinations(), which depends on 'func' * being set for alternatives, to enable proper sibling call detection. */ - if (opts.stackval || opts.orc || opts.uaccess || opts.noinstr) { - ret = add_special_section_alts(file); - if (ret) - return ret; + if (validate_branch_enabled() || opts.noinstr || opts.hack_jump_label || opts.disas) { + if (add_special_section_alts(file)) + return -1; } - ret = add_jump_destinations(file); - if (ret) - return ret; + if (add_jump_destinations(file)) + return -1; /* * Must be before add_call_destination(); it changes INSN_CALL to * INSN_JUMP. */ - ret = read_annotate(file, __annotate_ifc); - if (ret) - return ret; + if (read_annotate(file, __annotate_ifc)) + return -1; - ret = add_call_destinations(file); - if (ret) - return ret; + if (add_call_destinations(file)) + return -1; - ret = add_jump_table_alts(file); - if (ret) - return ret; + if (add_jump_table_alts(file)) + return -1; - ret = read_unwind_hints(file); - if (ret) - return ret; + if (read_unwind_hints(file)) + return -1; + + /* Must be after add_jump_destinations() */ + mark_holes(file); /* * Must be after add_call_destinations() such that it can override * dead_end_function() marks. */ - ret = read_annotate(file, __annotate_late); - if (ret) - return ret; + if (read_annotate(file, __annotate_late)) + return -1; return 0; } @@ -3354,7 +3411,7 @@ static bool pv_call_dest(struct objtool_file *file, struct instruction *insn) if (!reloc || strcmp(reloc->sym->name, "pv_ops")) return false; - idx = (arch_dest_reloc_offset(reloc_addend(reloc)) / sizeof(void *)); + idx = arch_insn_adjusted_addend(insn, reloc) / sizeof(void *); if (file->pv_ops[idx].clean) return true; @@ -3520,8 +3577,10 @@ static bool skip_alt_group(struct instruction *insn) return false; /* ANNOTATE_IGNORE_ALTERNATIVE */ - if (insn->alt_group->ignore) + if (insn->alt_group->ignore) { + TRACE_ALT(insn, "alt group ignored"); return true; + } /* * For NOP patched with CLAC/STAC, only follow the latter to avoid @@ -3543,258 +3602,404 @@ static bool skip_alt_group(struct instruction *insn) return alt_insn->type == INSN_CLAC || alt_insn->type == INSN_STAC; } -/* - * Follow the branch starting at the given instruction, and recursively follow - * any other branches (jumps). Meanwhile, track the frame pointer state at - * each instruction and validate all the rules described in - * tools/objtool/Documentation/objtool.txt. - */ -static int validate_branch(struct objtool_file *file, struct symbol *func, - struct instruction *insn, struct insn_state state) +static int checksum_debug_init(struct objtool_file *file) { - struct alternative *alt; - struct instruction *next_insn, *prev_insn = NULL; - struct section *sec; - u8 visited; - int ret; + char *dup, *s; - if (func && func->ignore) + if (!opts.debug_checksum) return 0; - sec = insn->sec; + dup = strdup(opts.debug_checksum); + if (!dup) { + ERROR_GLIBC("strdup"); + return -1; + } - while (1) { - next_insn = next_insn_to_validate(file, insn); + s = dup; + while (*s) { + struct symbol *func; + char *comma; - if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { - /* Ignore KCFI type preambles, which always fall through */ - if (!strncmp(func->name, "__cfi_", 6) || - !strncmp(func->name, "__pfx_", 6) || - !strncmp(func->name, "__pi___cfi_", 11) || - !strncmp(func->name, "__pi___pfx_", 11)) - return 0; + comma = strchr(s, ','); + if (comma) + *comma = '\0'; - if (file->ignore_unreachables) - return 0; + func = find_symbol_by_name(file->elf, s); + if (!func || !is_func_sym(func)) + WARN("--debug-checksum: can't find '%s'", s); + else + func->debug_checksum = 1; - WARN("%s() falls through to next function %s()", - func->name, insn_func(insn)->name); - func->warned = 1; + if (!comma) + break; - return 1; - } + s = comma + 1; + } - visited = VISITED_BRANCH << state.uaccess; - if (insn->visited & VISITED_BRANCH_MASK) { - if (!insn->hint && !insn_cfi_match(insn, &state.cfi)) - return 1; + free(dup); + return 0; +} - if (insn->visited & visited) - return 0; - } else { - nr_insns_visited++; - } +static void checksum_update_insn(struct objtool_file *file, struct symbol *func, + struct instruction *insn) +{ + struct reloc *reloc = insn_reloc(file, insn); + unsigned long offset; + struct symbol *sym; - if (state.noinstr) - state.instr += insn->instr; + if (insn->fake) + return; - if (insn->hint) { - if (insn->restore) { - struct instruction *save_insn, *i; + checksum_update(func, insn, insn->sec->data->d_buf + insn->offset, insn->len); - i = insn; - save_insn = NULL; + if (!reloc) { + struct symbol *call_dest = insn_call_dest(insn); - sym_for_each_insn_continue_reverse(file, func, i) { - if (i->save) { - save_insn = i; - break; - } - } + if (call_dest) + checksum_update(func, insn, call_dest->demangled_name, + strlen(call_dest->demangled_name)); + return; + } - if (!save_insn) { - WARN_INSN(insn, "no corresponding CFI save for CFI restore"); - return 1; + sym = reloc->sym; + offset = arch_insn_adjusted_addend(insn, reloc); + + if (is_string_sec(sym->sec)) { + char *str; + + str = sym->sec->data->d_buf + sym->offset + offset; + checksum_update(func, insn, str, strlen(str)); + return; + } + + if (is_sec_sym(sym)) { + sym = find_symbol_containing(reloc->sym->sec, offset); + if (!sym) + return; + + offset -= sym->offset; + } + + checksum_update(func, insn, sym->demangled_name, strlen(sym->demangled_name)); + checksum_update(func, insn, &offset, sizeof(offset)); +} + +static int validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state); +static int do_validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state); + +static int validate_insn(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state *statep, + struct instruction *prev_insn, struct instruction *next_insn, + bool *dead_end) +{ + /* prev_state and alt_name are not used if there is no disassembly support */ + struct insn_state prev_state __maybe_unused; + char *alt_name __maybe_unused = NULL; + struct alternative *alt; + u8 visited; + int ret; + + /* + * Any returns before the end of this function are effectively dead + * ends, i.e. validate_branch() has reached the end of the branch. + */ + *dead_end = true; + + visited = VISITED_BRANCH << statep->uaccess; + if (insn->visited & VISITED_BRANCH_MASK) { + if (!insn->hint && !insn_cfi_match(insn, &statep->cfi)) + return 1; + + if (insn->visited & visited) { + TRACE_INSN(insn, "already visited"); + return 0; + } + } else { + nr_insns_visited++; + } + + if (statep->noinstr) + statep->instr += insn->instr; + + if (insn->hint) { + if (insn->restore) { + struct instruction *save_insn, *i; + + i = insn; + save_insn = NULL; + + sym_for_each_insn_continue_reverse(file, func, i) { + if (i->save) { + save_insn = i; + break; } + } - if (!save_insn->visited) { - /* - * If the restore hint insn is at the - * beginning of a basic block and was - * branched to from elsewhere, and the - * save insn hasn't been visited yet, - * defer following this branch for now. - * It will be seen later via the - * straight-line path. - */ - if (!prev_insn) - return 0; + if (!save_insn) { + WARN_INSN(insn, "no corresponding CFI save for CFI restore"); + return 1; + } - WARN_INSN(insn, "objtool isn't smart enough to handle this CFI save/restore combo"); - return 1; + if (!save_insn->visited) { + /* + * If the restore hint insn is at the + * beginning of a basic block and was + * branched to from elsewhere, and the + * save insn hasn't been visited yet, + * defer following this branch for now. + * It will be seen later via the + * straight-line path. + */ + if (!prev_insn) { + TRACE_INSN(insn, "defer restore"); + return 0; } - insn->cfi = save_insn->cfi; - nr_cfi_reused++; + WARN_INSN(insn, "objtool isn't smart enough to handle this CFI save/restore combo"); + return 1; } - state.cfi = *insn->cfi; + insn->cfi = save_insn->cfi; + nr_cfi_reused++; + } + + statep->cfi = *insn->cfi; + } else { + /* XXX track if we actually changed statep->cfi */ + + if (prev_insn && !cficmp(prev_insn->cfi, &statep->cfi)) { + insn->cfi = prev_insn->cfi; + nr_cfi_reused++; } else { - /* XXX track if we actually changed state.cfi */ + insn->cfi = cfi_hash_find_or_add(&statep->cfi); + } + } - if (prev_insn && !cficmp(prev_insn->cfi, &state.cfi)) { - insn->cfi = prev_insn->cfi; - nr_cfi_reused++; - } else { - insn->cfi = cfi_hash_find_or_add(&state.cfi); + insn->visited |= visited; + + if (propagate_alt_cfi(file, insn)) + return 1; + + if (insn->alts) { + for (alt = insn->alts; alt; alt = alt->next) { + TRACE_ALT_BEGIN(insn, alt, alt_name); + ret = validate_branch(file, func, alt->insn, *statep); + TRACE_ALT_END(insn, alt, alt_name); + if (ret) { + BT_INSN(insn, "(alt)"); + return ret; } } + TRACE_ALT_INFO_NOADDR(insn, "/ ", "DEFAULT"); + } - insn->visited |= visited; + if (skip_alt_group(insn)) + return 0; + + prev_state = *statep; + ret = handle_insn_ops(insn, next_insn, statep); + TRACE_INSN_STATE(insn, &prev_state, statep); + + if (ret) + return 1; + + switch (insn->type) { + + case INSN_RETURN: + TRACE_INSN(insn, "return"); + return validate_return(func, insn, statep); + + case INSN_CALL: + case INSN_CALL_DYNAMIC: + if (insn->type == INSN_CALL) + TRACE_INSN(insn, "call"); + else + TRACE_INSN(insn, "indirect call"); + + ret = validate_call(file, insn, statep); + if (ret) + return ret; - if (propagate_alt_cfi(file, insn)) + if (opts.stackval && func && !is_special_call(insn) && + !has_valid_stack_frame(statep)) { + WARN_INSN(insn, "call without frame pointer save/setup"); return 1; + } - if (insn->alts) { - for (alt = insn->alts; alt; alt = alt->next) { - ret = validate_branch(file, func, alt->insn, state); - if (ret) { - BT_INSN(insn, "(alt)"); - return ret; - } + break; + + case INSN_JUMP_CONDITIONAL: + case INSN_JUMP_UNCONDITIONAL: + if (is_sibling_call(insn)) { + TRACE_INSN(insn, "sibling call"); + ret = validate_sibling_call(file, insn, statep); + if (ret) + return ret; + + } else if (insn->jump_dest) { + if (insn->type == INSN_JUMP_UNCONDITIONAL) + TRACE_INSN(insn, "unconditional jump"); + else + TRACE_INSN(insn, "jump taken"); + + ret = validate_branch(file, func, insn->jump_dest, *statep); + if (ret) { + BT_INSN(insn, "(branch)"); + return ret; } } - if (skip_alt_group(insn)) + if (insn->type == INSN_JUMP_UNCONDITIONAL) return 0; - if (handle_insn_ops(insn, next_insn, &state)) - return 1; - - switch (insn->type) { - - case INSN_RETURN: - return validate_return(func, insn, &state); + TRACE_INSN(insn, "jump not taken"); + break; - case INSN_CALL: - case INSN_CALL_DYNAMIC: - ret = validate_call(file, insn, &state); + case INSN_JUMP_DYNAMIC: + case INSN_JUMP_DYNAMIC_CONDITIONAL: + TRACE_INSN(insn, "indirect jump"); + if (is_sibling_call(insn)) { + ret = validate_sibling_call(file, insn, statep); if (ret) return ret; + } - if (opts.stackval && func && !is_special_call(insn) && - !has_valid_stack_frame(&state)) { - WARN_INSN(insn, "call without frame pointer save/setup"); - return 1; - } + if (insn->type == INSN_JUMP_DYNAMIC) + return 0; - break; + break; - case INSN_JUMP_CONDITIONAL: - case INSN_JUMP_UNCONDITIONAL: - if (is_sibling_call(insn)) { - ret = validate_sibling_call(file, insn, &state); - if (ret) - return ret; + case INSN_SYSCALL: + TRACE_INSN(insn, "syscall"); + if (func && (!next_insn || !next_insn->hint)) { + WARN_INSN(insn, "unsupported instruction in callable function"); + return 1; + } - } else if (insn->jump_dest) { - ret = validate_branch(file, func, - insn->jump_dest, state); - if (ret) { - BT_INSN(insn, "(branch)"); - return ret; - } - } + break; - if (insn->type == INSN_JUMP_UNCONDITIONAL) - return 0; + case INSN_SYSRET: + TRACE_INSN(insn, "sysret"); + if (func && (!next_insn || !next_insn->hint)) { + WARN_INSN(insn, "unsupported instruction in callable function"); + return 1; + } + return 0; + + case INSN_STAC: + TRACE_INSN(insn, "stac"); + if (!opts.uaccess) break; - case INSN_JUMP_DYNAMIC: - case INSN_JUMP_DYNAMIC_CONDITIONAL: - if (is_sibling_call(insn)) { - ret = validate_sibling_call(file, insn, &state); - if (ret) - return ret; - } + if (statep->uaccess) { + WARN_INSN(insn, "recursive UACCESS enable"); + return 1; + } - if (insn->type == INSN_JUMP_DYNAMIC) - return 0; + statep->uaccess = true; + break; + case INSN_CLAC: + TRACE_INSN(insn, "clac"); + if (!opts.uaccess) break; - case INSN_SYSCALL: - if (func && (!next_insn || !next_insn->hint)) { - WARN_INSN(insn, "unsupported instruction in callable function"); - return 1; - } + if (!statep->uaccess && func) { + WARN_INSN(insn, "redundant UACCESS disable"); + return 1; + } - break; + if (func_uaccess_safe(func) && !statep->uaccess_stack) { + WARN_INSN(insn, "UACCESS-safe disables UACCESS"); + return 1; + } - case INSN_SYSRET: - if (func && (!next_insn || !next_insn->hint)) { - WARN_INSN(insn, "unsupported instruction in callable function"); - return 1; - } + statep->uaccess = false; + break; - return 0; + case INSN_STD: + TRACE_INSN(insn, "std"); + if (statep->df) { + WARN_INSN(insn, "recursive STD"); + return 1; + } - case INSN_STAC: - if (!opts.uaccess) - break; + statep->df = true; + break; - if (state.uaccess) { - WARN_INSN(insn, "recursive UACCESS enable"); - return 1; - } + case INSN_CLD: + TRACE_INSN(insn, "cld"); + if (!statep->df && func) { + WARN_INSN(insn, "redundant CLD"); + return 1; + } - state.uaccess = true; - break; + statep->df = false; + break; - case INSN_CLAC: - if (!opts.uaccess) - break; + default: + break; + } - if (!state.uaccess && func) { - WARN_INSN(insn, "redundant UACCESS disable"); - return 1; - } + if (insn->dead_end) + TRACE_INSN(insn, "dead end"); - if (func_uaccess_safe(func) && !state.uaccess_stack) { - WARN_INSN(insn, "UACCESS-safe disables UACCESS"); - return 1; - } + *dead_end = insn->dead_end; + return 0; +} - state.uaccess = false; - break; +/* + * Follow the branch starting at the given instruction, and recursively follow + * any other branches (jumps). Meanwhile, track the frame pointer state at + * each instruction and validate all the rules described in + * tools/objtool/Documentation/objtool.txt. + */ +static int do_validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state) +{ + struct instruction *next_insn, *prev_insn = NULL; + bool dead_end; + int ret; - case INSN_STD: - if (state.df) { - WARN_INSN(insn, "recursive STD"); - return 1; - } + if (func && func->ignore) + return 0; - state.df = true; - break; + do { + insn->trace = 0; + next_insn = next_insn_to_validate(file, insn); - case INSN_CLD: - if (!state.df && func) { - WARN_INSN(insn, "redundant CLD"); - return 1; - } + if (opts.checksum && func && insn->sec) + checksum_update_insn(file, func, insn); - state.df = false; - break; + if (func && insn_func(insn) && func != insn_func(insn)->pfunc) { + /* Ignore KCFI type preambles, which always fall through */ + if (is_prefix_func(func)) + return 0; - default: - break; + if (file->ignore_unreachables) + return 0; + + WARN("%s() falls through to next function %s()", + func->name, insn_func(insn)->name); + func->warned = 1; + + return 1; } - if (insn->dead_end) - return 0; + ret = validate_insn(file, func, insn, &state, prev_insn, next_insn, + &dead_end); + + if (!insn->trace) { + if (ret) + TRACE_INSN(insn, "warning (%d)", ret); + else + TRACE_INSN(insn, NULL); + } - if (!next_insn) { + if (!dead_end && !next_insn) { if (state.cfi.cfa.base == CFI_UNDEFINED) return 0; if (file->ignore_unreachables) @@ -3802,15 +4007,28 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, WARN("%s%sunexpected end of section %s", func ? func->name : "", func ? "(): " : "", - sec->name); + insn->sec->name); return 1; } prev_insn = insn; insn = next_insn; - } - return 0; + } while (!dead_end); + + return ret; +} + +static int validate_branch(struct objtool_file *file, struct symbol *func, + struct instruction *insn, struct insn_state state) +{ + int ret; + + trace_depth_inc(); + ret = do_validate_branch(file, func, insn, state); + trace_depth_dec(); + + return ret; } static int validate_unwind_hint(struct objtool_file *file, @@ -3818,7 +4036,13 @@ static int validate_unwind_hint(struct objtool_file *file, struct insn_state *state) { if (insn->hint && !insn->visited) { - int ret = validate_branch(file, insn_func(insn), insn, *state); + struct symbol *func = insn_func(insn); + int ret; + + if (opts.checksum) + checksum_init(func); + + ret = validate_branch(file, func, insn, *state); if (ret) BT_INSN(insn, "<=== (hint)"); return ret; @@ -4062,7 +4286,8 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio struct instruction *prev_insn; int i; - if (insn->type == INSN_NOP || insn->type == INSN_TRAP || (func && func->ignore)) + if (insn->type == INSN_NOP || insn->type == INSN_TRAP || + insn->hole || (func && func->ignore)) return true; /* @@ -4073,47 +4298,6 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio !strcmp(insn->sec->name, ".altinstr_aux")) return true; - /* - * Whole archive runs might encounter dead code from weak symbols. - * This is where the linker will have dropped the weak symbol in - * favour of a regular symbol, but leaves the code in place. - * - * In this case we'll find a piece of code (whole function) that is not - * covered by a !section symbol. Ignore them. - */ - if (opts.link && !func) { - int size = find_symbol_hole_containing(insn->sec, insn->offset); - unsigned long end = insn->offset + size; - - if (!size) /* not a hole */ - return false; - - if (size < 0) /* hole until the end */ - return true; - - sec_for_each_insn_continue(file, insn) { - /* - * If we reach a visited instruction at or before the - * end of the hole, ignore the unreachable. - */ - if (insn->visited) - return true; - - if (insn->offset >= end) - break; - - /* - * If this hole jumps to a .cold function, mark it ignore too. - */ - if (insn->jump_dest && insn_func(insn->jump_dest) && - strstr(insn_func(insn->jump_dest)->name, ".cold")) { - insn_func(insn->jump_dest)->ignore = true; - } - } - - return false; - } - if (!func) return false; @@ -4165,14 +4349,54 @@ static bool ignore_unreachable_insn(struct objtool_file *file, struct instructio return false; } -static int add_prefix_symbol(struct objtool_file *file, struct symbol *func) +/* + * For FineIBT or kCFI, a certain number of bytes preceding the function may be + * NOPs. Those NOPs may be rewritten at runtime and executed, so give them a + * proper function name: __pfx_<func>. + * + * The NOPs may not exist for the following cases: + * + * - compiler cloned functions (*.cold, *.part0, etc) + * - asm functions created with inline asm or without SYM_FUNC_START() + * + * Also, the function may already have a prefix from a previous objtool run + * (livepatch extracted functions, or manually running objtool multiple times). + * + * So return 0 if the NOPs are missing or the function already has a prefix + * symbol. + */ +static int create_prefix_symbol(struct objtool_file *file, struct symbol *func) { struct instruction *insn, *prev; + char name[SYM_NAME_LEN]; struct cfi_state *cfi; + if (!is_func_sym(func) || is_prefix_func(func) || + func->cold || func->static_call_tramp) + return 0; + + if ((strlen(func->name) + sizeof("__pfx_") > SYM_NAME_LEN)) { + WARN("%s: symbol name too long, can't create __pfx_ symbol", + func->name); + return 0; + } + + if (snprintf_check(name, SYM_NAME_LEN, "__pfx_%s", func->name)) + return -1; + + if (file->klp) { + struct symbol *pfx; + + pfx = find_symbol_by_offset(func->sec, func->offset - opts.prefix); + if (pfx && is_prefix_func(pfx) && !strcmp(pfx->name, name)) + return 0; + } + insn = find_insn(file, func->sec, func->offset); - if (!insn) + if (!insn) { + WARN("%s: can't find starting instruction", func->name); return -1; + } for (prev = prev_insn_same_sec(file, insn); prev; @@ -4180,22 +4404,27 @@ static int add_prefix_symbol(struct objtool_file *file, struct symbol *func) u64 offset; if (prev->type != INSN_NOP) - return -1; + return 0; offset = func->offset - prev->offset; if (offset > opts.prefix) - return -1; + return 0; if (offset < opts.prefix) continue; - elf_create_prefix_symbol(file->elf, func, opts.prefix); + if (!elf_create_symbol(file->elf, name, func->sec, + GELF_ST_BIND(func->sym.st_info), + GELF_ST_TYPE(func->sym.st_info), + prev->offset, opts.prefix)) + return -1; + break; } if (!prev) - return -1; + return 0; if (!insn->cfi) { /* @@ -4213,20 +4442,18 @@ static int add_prefix_symbol(struct objtool_file *file, struct symbol *func) return 0; } -static int add_prefix_symbols(struct objtool_file *file) +static int create_prefix_symbols(struct objtool_file *file) { struct section *sec; struct symbol *func; - for_each_sec(file, sec) { - if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + for_each_sec(file->elf, sec) { + if (!is_text_sec(sec)) continue; sec_for_each_sym(sec, func) { - if (func->type != STT_FUNC) - continue; - - add_prefix_symbol(file, func); + if (create_prefix_symbol(file, func)) + return -1; } } @@ -4237,6 +4464,7 @@ static int validate_symbol(struct objtool_file *file, struct section *sec, struct symbol *sym, struct insn_state *state) { struct instruction *insn; + struct symbol *func; int ret; if (!sym->len) { @@ -4254,9 +4482,26 @@ static int validate_symbol(struct objtool_file *file, struct section *sec, if (opts.uaccess) state->uaccess = sym->uaccess_safe; - ret = validate_branch(file, insn_func(insn), insn, *state); + func = insn_func(insn); + + if (opts.checksum) + checksum_init(func); + + if (opts.trace && !fnmatch(opts.trace, sym->name, 0)) { + trace_enable(); + TRACE("%s: validation begin\n", sym->name); + } + + ret = validate_branch(file, func, insn, *state); if (ret) BT_INSN(insn, "<=== (sym)"); + + TRACE("%s: validation %s\n\n", sym->name, ret ? "failed" : "end"); + trace_disable(); + + if (opts.checksum) + checksum_finish(func); + return ret; } @@ -4267,7 +4512,7 @@ static int validate_section(struct objtool_file *file, struct section *sec) int warnings = 0; sec_for_each_sym(sec, func) { - if (func->type != STT_FUNC) + if (!is_func_sym(func)) continue; init_insn_state(file, &state, sec); @@ -4310,8 +4555,8 @@ static int validate_functions(struct objtool_file *file) struct section *sec; int warnings = 0; - for_each_sec(file, sec) { - if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + for_each_sec(file->elf, sec) { + if (!is_text_sec(sec)) continue; warnings += validate_section(file, sec); @@ -4438,12 +4683,7 @@ static int validate_ibt_insn(struct objtool_file *file, struct instruction *insn reloc_offset(reloc) + 1, (insn->offset + insn->len) - (reloc_offset(reloc) + 1))) { - off = reloc->sym->offset; - if (reloc_type(reloc) == R_X86_64_PC32 || - reloc_type(reloc) == R_X86_64_PLT32) - off += arch_dest_reloc_offset(reloc_addend(reloc)); - else - off += reloc_addend(reloc); + off = reloc->sym->offset + arch_insn_adjusted_addend(insn, reloc); dest = find_insn(file, reloc->sym->sec, off); if (!dest) @@ -4494,10 +4734,10 @@ static int validate_ibt(struct objtool_file *file) for_each_insn(file, insn) warnings += validate_ibt_insn(file, insn); - for_each_sec(file, sec) { + for_each_sec(file->elf, sec) { /* Already done by validate_ibt_insn() */ - if (sec->sh.sh_flags & SHF_EXECINSTR) + if (is_text_sec(sec)) continue; if (!sec->rsec) @@ -4512,8 +4752,8 @@ static int validate_ibt(struct objtool_file *file) !strncmp(sec->name, ".debug", 6) || !strcmp(sec->name, ".altinstructions") || !strcmp(sec->name, ".ibt_endbr_seal") || + !strcmp(sec->name, ".kcfi_traps") || !strcmp(sec->name, ".orc_unwind_ip") || - !strcmp(sec->name, ".parainstructions") || !strcmp(sec->name, ".retpoline_sites") || !strcmp(sec->name, ".smp_locks") || !strcmp(sec->name, ".static_call_sites") || @@ -4522,12 +4762,14 @@ static int validate_ibt(struct objtool_file *file) !strcmp(sec->name, "__bug_table") || !strcmp(sec->name, "__ex_table") || !strcmp(sec->name, "__jump_table") || + !strcmp(sec->name, "__klp_funcs") || !strcmp(sec->name, "__mcount_loc") || - !strcmp(sec->name, ".kcfi_traps") || !strcmp(sec->name, ".llvm.call-graph-profile") || !strcmp(sec->name, ".llvm_bb_addr_map") || !strcmp(sec->name, "__tracepoints") || - strstr(sec->name, "__patchable_function_entries")) + !strcmp(sec->name, ".return_sites") || + !strcmp(sec->name, ".call_sites") || + !strcmp(sec->name, "__patchable_function_entries")) continue; for_each_reloc(sec->rsec, reloc) @@ -4601,87 +4843,6 @@ static int validate_reachable_instructions(struct objtool_file *file) return warnings; } -/* 'funcs' is a space-separated list of function names */ -static void disas_funcs(const char *funcs) -{ - const char *objdump_str, *cross_compile; - int size, ret; - char *cmd; - - cross_compile = getenv("CROSS_COMPILE"); - if (!cross_compile) - cross_compile = ""; - - objdump_str = "%sobjdump -wdr %s | gawk -M -v _funcs='%s' '" - "BEGIN { split(_funcs, funcs); }" - "/^$/ { func_match = 0; }" - "/<.*>:/ { " - "f = gensub(/.*<(.*)>:/, \"\\\\1\", 1);" - "for (i in funcs) {" - "if (funcs[i] == f) {" - "func_match = 1;" - "base = strtonum(\"0x\" $1);" - "break;" - "}" - "}" - "}" - "{" - "if (func_match) {" - "addr = strtonum(\"0x\" $1);" - "printf(\"%%04x \", addr - base);" - "print;" - "}" - "}' 1>&2"; - - /* fake snprintf() to calculate the size */ - size = snprintf(NULL, 0, objdump_str, cross_compile, objname, funcs) + 1; - if (size <= 0) { - WARN("objdump string size calculation failed"); - return; - } - - cmd = malloc(size); - - /* real snprintf() */ - snprintf(cmd, size, objdump_str, cross_compile, objname, funcs); - ret = system(cmd); - if (ret) { - WARN("disassembly failed: %d", ret); - return; - } -} - -static void disas_warned_funcs(struct objtool_file *file) -{ - struct symbol *sym; - char *funcs = NULL, *tmp; - - for_each_sym(file, sym) { - if (sym->warned) { - if (!funcs) { - funcs = malloc(strlen(sym->name) + 1); - if (!funcs) { - ERROR_GLIBC("malloc"); - return; - } - strcpy(funcs, sym->name); - } else { - tmp = malloc(strlen(funcs) + strlen(sym->name) + 2); - if (!tmp) { - ERROR_GLIBC("malloc"); - return; - } - sprintf(tmp, "%s %s", funcs, sym->name); - free(funcs); - funcs = tmp; - } - } - } - - if (funcs) - disas_funcs(funcs); -} - __weak bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc) { unsigned int type = reloc_type(reloc); @@ -4696,7 +4857,7 @@ static int check_abs_references(struct objtool_file *file) struct reloc *reloc; int ret = 0; - for_each_sec(file, sec) { + for_each_sec(file->elf, sec) { /* absolute references in non-loadable sections are fine */ if (!(sec->sh.sh_flags & SHF_ALLOC)) continue; @@ -4751,10 +4912,35 @@ static void free_insns(struct objtool_file *file) free(chunk->addr); } +const char *objtool_disas_insn(struct instruction *insn) +{ + struct disas_context *dctx = objtool_disas_ctx; + + if (!dctx) + return ""; + + disas_insn(dctx, insn); + return disas_result(dctx); +} + int check(struct objtool_file *file) { + struct disas_context *disas_ctx = NULL; int ret = 0, warnings = 0; + /* + * Create a disassembly context if we might disassemble any + * instruction or function. + */ + if (opts.verbose || opts.backtrace || opts.trace || opts.disas) { + disas_ctx = disas_context_create(file); + if (!disas_ctx) { + opts.disas = false; + opts.trace = false; + } + objtool_disas_ctx = disas_ctx; + } + arch_initial_func_cfi_state(&initial_func_cfi); init_cfi_state(&init_cfi); init_cfi_state(&func_cfi); @@ -4770,6 +4956,10 @@ int check(struct objtool_file *file) cfi_hash_add(&init_cfi); cfi_hash_add(&func_cfi); + ret = checksum_debug_init(file); + if (ret) + goto out; + ret = decode_sections(file); if (ret) goto out; @@ -4780,7 +4970,7 @@ int check(struct objtool_file *file) if (opts.retpoline) warnings += validate_retpoline(file); - if (opts.stackval || opts.orc || opts.uaccess) { + if (validate_branch_enabled()) { int w = 0; w += validate_functions(file); @@ -4845,7 +5035,7 @@ int check(struct objtool_file *file) } if (opts.prefix) { - ret = add_prefix_symbols(file); + ret = create_prefix_symbols(file); if (ret) goto out; } @@ -4859,14 +5049,18 @@ int check(struct objtool_file *file) if (opts.noabs) warnings += check_abs_references(file); + if (opts.checksum) { + ret = create_sym_checksum_section(file); + if (ret) + goto out; + } + if (opts.orc && nr_insns) { ret = orc_create(file); if (ret) goto out; } - free_insns(file); - if (opts.stats) { printf("nr_insns_visited: %ld\n", nr_insns_visited); printf("nr_cfi: %ld\n", nr_cfi); @@ -4875,18 +5069,32 @@ int check(struct objtool_file *file) } out: - if (!ret && !warnings) - return 0; + if (ret || warnings) { + if (opts.werror && warnings) + ret = 1; + + if (opts.verbose) { + if (opts.werror && warnings) + WARN("%d warning(s) upgraded to errors", warnings); + disas_warned_funcs(disas_ctx); + } + } - if (opts.werror && warnings) - ret = 1; + if (opts.disas) + disas_funcs(disas_ctx); - if (opts.verbose) { - if (opts.werror && warnings) - WARN("%d warning(s) upgraded to errors", warnings); - print_args(); - disas_warned_funcs(file); + if (disas_ctx) { + disas_context_destroy(disas_ctx); + objtool_disas_ctx = NULL; } + free_insns(file); + + if (!ret && !warnings) + return 0; + + if (opts.backup && make_backup()) + return 1; + return ret; } diff --git a/tools/objtool/disas.c b/tools/objtool/disas.c new file mode 100644 index 000000000000..2b5059f55e40 --- /dev/null +++ b/tools/objtool/disas.c @@ -0,0 +1,1248 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2015-2017 Josh Poimboeuf <jpoimboe@redhat.com> + */ + +#define _GNU_SOURCE +#include <fnmatch.h> + +#include <objtool/arch.h> +#include <objtool/check.h> +#include <objtool/disas.h> +#include <objtool/special.h> +#include <objtool/warn.h> + +#include <bfd.h> +#include <linux/string.h> +#include <tools/dis-asm-compat.h> + +/* + * Size of the buffer for storing the result of disassembling + * a single instruction. + */ +#define DISAS_RESULT_SIZE 1024 + +struct disas_context { + struct objtool_file *file; + struct instruction *insn; + bool alt_applied; + char result[DISAS_RESULT_SIZE]; + disassembler_ftype disassembler; + struct disassemble_info info; +}; + +/* + * Maximum number of alternatives + */ +#define DISAS_ALT_MAX 5 + +/* + * Maximum number of instructions per alternative + */ +#define DISAS_ALT_INSN_MAX 50 + +/* + * Information to disassemble an alternative + */ +struct disas_alt { + struct instruction *orig_insn; /* original instruction */ + struct alternative *alt; /* alternative or NULL if default code */ + char *name; /* name for this alternative */ + int width; /* formatting width */ + struct { + char *str; /* instruction string */ + int offset; /* instruction offset */ + int nops; /* number of nops */ + } insn[DISAS_ALT_INSN_MAX]; /* alternative instructions */ + int insn_idx; /* index of the next instruction to print */ +}; + +#define DALT_DEFAULT(dalt) (!(dalt)->alt) +#define DALT_INSN(dalt) (DALT_DEFAULT(dalt) ? (dalt)->orig_insn : (dalt)->alt->insn) +#define DALT_GROUP(dalt) (DALT_INSN(dalt)->alt_group) +#define DALT_ALTID(dalt) ((dalt)->orig_insn->offset) + +#define ALT_FLAGS_SHIFT 16 +#define ALT_FLAG_NOT (1 << 0) +#define ALT_FLAG_DIRECT_CALL (1 << 1) +#define ALT_FEATURE_MASK ((1 << ALT_FLAGS_SHIFT) - 1) + +static int alt_feature(unsigned int ft_flags) +{ + return (ft_flags & ALT_FEATURE_MASK); +} + +static int alt_flags(unsigned int ft_flags) +{ + return (ft_flags >> ALT_FLAGS_SHIFT); +} + +/* + * Wrapper around asprintf() to allocate and format a string. + * Return the allocated string or NULL on error. + */ +static char *strfmt(const char *fmt, ...) +{ + va_list ap; + char *str; + int rv; + + va_start(ap, fmt); + rv = vasprintf(&str, fmt, ap); + va_end(ap); + + return rv == -1 ? NULL : str; +} + +static int sprint_name(char *str, const char *name, unsigned long offset) +{ + int len; + + if (offset) + len = sprintf(str, "%s+0x%lx", name, offset); + else + len = sprintf(str, "%s", name); + + return len; +} + +#define DINFO_FPRINTF(dinfo, ...) \ + ((*(dinfo)->fprintf_func)((dinfo)->stream, __VA_ARGS__)) + +static int disas_result_fprintf(struct disas_context *dctx, + const char *fmt, va_list ap) +{ + char *buf = dctx->result; + int avail, len; + + len = strlen(buf); + if (len >= DISAS_RESULT_SIZE - 1) { + WARN_FUNC(dctx->insn->sec, dctx->insn->offset, + "disassembly buffer is full"); + return -1; + } + avail = DISAS_RESULT_SIZE - len; + + len = vsnprintf(buf + len, avail, fmt, ap); + if (len < 0 || len >= avail) { + WARN_FUNC(dctx->insn->sec, dctx->insn->offset, + "disassembly buffer is truncated"); + return -1; + } + + return 0; +} + +static int disas_fprintf(void *stream, const char *fmt, ...) +{ + va_list arg; + int rv; + + va_start(arg, fmt); + rv = disas_result_fprintf(stream, fmt, arg); + va_end(arg); + + return rv; +} + +/* + * For init_disassemble_info_compat(). + */ +static int disas_fprintf_styled(void *stream, + enum disassembler_style style, + const char *fmt, ...) +{ + va_list arg; + int rv; + + va_start(arg, fmt); + rv = disas_result_fprintf(stream, fmt, arg); + va_end(arg); + + return rv; +} + +static void disas_print_addr_sym(struct section *sec, struct symbol *sym, + bfd_vma addr, struct disassemble_info *dinfo) +{ + char symstr[1024]; + char *str; + + if (sym) { + sprint_name(symstr, sym->name, addr - sym->offset); + DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr); + } else { + str = offstr(sec, addr); + DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str); + free(str); + } +} + +static bool disas_print_addr_alt(bfd_vma addr, struct disassemble_info *dinfo) +{ + struct disas_context *dctx = dinfo->application_data; + struct instruction *orig_first_insn; + struct alt_group *alt_group; + unsigned long offset; + struct symbol *sym; + + /* + * Check if we are processing an alternative at the original + * instruction address (i.e. if alt_applied is true) and if + * we are referencing an address inside the alternative. + * + * For example, this happens if there is a branch inside an + * alternative. In that case, the address should be updated + * to a reference inside the original instruction flow. + */ + if (!dctx->alt_applied) + return false; + + alt_group = dctx->insn->alt_group; + if (!alt_group || !alt_group->orig_group || + addr < alt_group->first_insn->offset || + addr > alt_group->last_insn->offset) + return false; + + orig_first_insn = alt_group->orig_group->first_insn; + offset = addr - alt_group->first_insn->offset; + + addr = orig_first_insn->offset + offset; + sym = orig_first_insn->sym; + + disas_print_addr_sym(orig_first_insn->sec, sym, addr, dinfo); + + return true; +} + +static void disas_print_addr_noreloc(bfd_vma addr, + struct disassemble_info *dinfo) +{ + struct disas_context *dctx = dinfo->application_data; + struct instruction *insn = dctx->insn; + struct symbol *sym = NULL; + + if (disas_print_addr_alt(addr, dinfo)) + return; + + if (insn->sym && addr >= insn->sym->offset && + addr < insn->sym->offset + insn->sym->len) { + sym = insn->sym; + } + + disas_print_addr_sym(insn->sec, sym, addr, dinfo); +} + +static void disas_print_addr_reloc(bfd_vma addr, struct disassemble_info *dinfo) +{ + struct disas_context *dctx = dinfo->application_data; + struct instruction *insn = dctx->insn; + unsigned long offset; + struct reloc *reloc; + char symstr[1024]; + char *str; + + reloc = find_reloc_by_dest_range(dctx->file->elf, insn->sec, + insn->offset, insn->len); + if (!reloc) { + /* + * There is no relocation for this instruction although + * the address to resolve points to the next instruction. + * So this is an effective reference to the next IP, for + * example: "lea 0x0(%rip),%rdi". The kernel can reference + * the next IP with _THIS_IP_ macro. + */ + DINFO_FPRINTF(dinfo, "0x%lx <_THIS_IP_>", addr); + return; + } + + offset = arch_insn_adjusted_addend(insn, reloc); + + /* + * If the relocation symbol is a section name (for example ".bss") + * then we try to further resolve the name. + */ + if (reloc->sym->type == STT_SECTION) { + str = offstr(reloc->sym->sec, reloc->sym->offset + offset); + DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, str); + free(str); + } else { + sprint_name(symstr, reloc->sym->name, offset); + DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, symstr); + } +} + +/* + * Resolve an address into a "<symbol>+<offset>" string. + */ +static void disas_print_address(bfd_vma addr, struct disassemble_info *dinfo) +{ + struct disas_context *dctx = dinfo->application_data; + struct instruction *insn = dctx->insn; + struct instruction *jump_dest; + struct symbol *sym; + bool is_reloc; + + /* + * If the instruction is a call/jump and it references a + * destination then this is likely the address we are looking + * up. So check it first. + */ + jump_dest = insn->jump_dest; + if (jump_dest && jump_dest->sym && jump_dest->offset == addr) { + if (!disas_print_addr_alt(addr, dinfo)) + disas_print_addr_sym(jump_dest->sec, jump_dest->sym, + addr, dinfo); + return; + } + + /* + * If the address points to the next instruction then there is + * probably a relocation. It can be a false positive when the + * current instruction is referencing the address of the next + * instruction. This particular case will be handled in + * disas_print_addr_reloc(). + */ + is_reloc = (addr == insn->offset + insn->len); + + /* + * The call destination offset can be the address we are looking + * up, or 0 if there is a relocation. + */ + sym = insn_call_dest(insn); + if (sym && (sym->offset == addr || (sym->offset == 0 && is_reloc))) { + DINFO_FPRINTF(dinfo, "0x%lx <%s>", addr, sym->name); + return; + } + + if (!is_reloc) + disas_print_addr_noreloc(addr, dinfo); + else + disas_print_addr_reloc(addr, dinfo); +} + +/* + * Initialize disassemble info arch, mach (32 or 64-bit) and options. + */ +int disas_info_init(struct disassemble_info *dinfo, + int arch, int mach32, int mach64, + const char *options) +{ + struct disas_context *dctx = dinfo->application_data; + struct objtool_file *file = dctx->file; + + dinfo->arch = arch; + + switch (file->elf->ehdr.e_ident[EI_CLASS]) { + case ELFCLASS32: + dinfo->mach = mach32; + break; + case ELFCLASS64: + dinfo->mach = mach64; + break; + default: + return -1; + } + + dinfo->disassembler_options = options; + + return 0; +} + +struct disas_context *disas_context_create(struct objtool_file *file) +{ + struct disas_context *dctx; + struct disassemble_info *dinfo; + int err; + + dctx = malloc(sizeof(*dctx)); + if (!dctx) { + WARN("failed to allocate disassembly context"); + return NULL; + } + + dctx->file = file; + dinfo = &dctx->info; + + init_disassemble_info_compat(dinfo, dctx, + disas_fprintf, disas_fprintf_styled); + + dinfo->read_memory_func = buffer_read_memory; + dinfo->print_address_func = disas_print_address; + dinfo->application_data = dctx; + + /* + * bfd_openr() is not used to avoid doing ELF data processing + * and caching that has already being done. Here, we just need + * to identify the target file so we call an arch specific + * function to fill some disassemble info (arch, mach). + */ + + dinfo->arch = bfd_arch_unknown; + dinfo->mach = 0; + + err = arch_disas_info_init(dinfo); + if (err || dinfo->arch == bfd_arch_unknown || dinfo->mach == 0) { + WARN("failed to init disassembly arch"); + goto error; + } + + dinfo->endian = (file->elf->ehdr.e_ident[EI_DATA] == ELFDATA2MSB) ? + BFD_ENDIAN_BIG : BFD_ENDIAN_LITTLE; + + disassemble_init_for_target(dinfo); + + dctx->disassembler = disassembler(dinfo->arch, + dinfo->endian == BFD_ENDIAN_BIG, + dinfo->mach, NULL); + if (!dctx->disassembler) { + WARN("failed to create disassembler function"); + goto error; + } + + return dctx; + +error: + free(dctx); + return NULL; +} + +void disas_context_destroy(struct disas_context *dctx) +{ + free(dctx); +} + +char *disas_result(struct disas_context *dctx) +{ + return dctx->result; +} + +#define DISAS_INSN_OFFSET_SPACE 10 +#define DISAS_INSN_SPACE 60 + +#define DISAS_PRINSN(dctx, insn, depth) \ + disas_print_insn(stdout, dctx, insn, depth, "\n") + +/* + * Print a message in the instruction flow. If sec is not NULL then the + * address at the section offset is printed in addition of the message, + * otherwise only the message is printed. + */ +static int disas_vprint(FILE *stream, struct section *sec, unsigned long offset, + int depth, const char *format, va_list ap) +{ + const char *addr_str; + int i, n; + int len; + + len = sym_name_max_len + DISAS_INSN_OFFSET_SPACE; + if (depth < 0) { + len += depth; + depth = 0; + } + + n = 0; + + if (sec) { + addr_str = offstr(sec, offset); + n += fprintf(stream, "%6lx: %-*s ", offset, len, addr_str); + free((char *)addr_str); + } else { + len += DISAS_INSN_OFFSET_SPACE + 1; + n += fprintf(stream, "%-*s", len, ""); + } + + /* print vertical bars to show the code flow */ + for (i = 0; i < depth; i++) + n += fprintf(stream, "| "); + + if (format) + n += vfprintf(stream, format, ap); + + return n; +} + +static int disas_print(FILE *stream, struct section *sec, unsigned long offset, + int depth, const char *format, ...) +{ + va_list args; + int len; + + va_start(args, format); + len = disas_vprint(stream, sec, offset, depth, format, args); + va_end(args); + + return len; +} + +/* + * Print a message in the instruction flow. If insn is not NULL then + * the instruction address is printed in addition of the message, + * otherwise only the message is printed. In all cases, the instruction + * itself is not printed. + */ +void disas_print_info(FILE *stream, struct instruction *insn, int depth, + const char *format, ...) +{ + struct section *sec; + unsigned long off; + va_list args; + + if (insn) { + sec = insn->sec; + off = insn->offset; + } else { + sec = NULL; + off = 0; + } + + va_start(args, format); + disas_vprint(stream, sec, off, depth, format, args); + va_end(args); +} + +/* + * Print an instruction address (offset and function), the instruction itself + * and an optional message. + */ +void disas_print_insn(FILE *stream, struct disas_context *dctx, + struct instruction *insn, int depth, + const char *format, ...) +{ + char fake_nop_insn[32]; + const char *insn_str; + bool fake_nop; + va_list args; + int len; + + /* + * Alternative can insert a fake nop, sometimes with no + * associated section so nothing to disassemble. + */ + fake_nop = (!insn->sec && insn->type == INSN_NOP); + if (fake_nop) { + snprintf(fake_nop_insn, 32, "<fake nop> (%d bytes)", insn->len); + insn_str = fake_nop_insn; + } else { + disas_insn(dctx, insn); + insn_str = disas_result(dctx); + } + + /* print the instruction */ + len = (depth + 1) * 2 < DISAS_INSN_SPACE ? DISAS_INSN_SPACE - (depth+1) * 2 : 1; + disas_print_info(stream, insn, depth, "%-*s", len, insn_str); + + /* print message if any */ + if (!format) + return; + + if (strcmp(format, "\n") == 0) { + fprintf(stream, "\n"); + return; + } + + fprintf(stream, " - "); + va_start(args, format); + vfprintf(stream, format, args); + va_end(args); +} + +/* + * Disassemble a single instruction. Return the size of the instruction. + * + * If alt_applied is true then insn should be an instruction from of an + * alternative (i.e. insn->alt_group != NULL), and it is disassembled + * at the location of the original code it is replacing. When the + * instruction references any address inside the alternative then + * these references will be re-adjusted to replace the original code. + */ +static size_t disas_insn_common(struct disas_context *dctx, + struct instruction *insn, + bool alt_applied) +{ + disassembler_ftype disasm = dctx->disassembler; + struct disassemble_info *dinfo = &dctx->info; + + dctx->insn = insn; + dctx->alt_applied = alt_applied; + dctx->result[0] = '\0'; + + if (insn->type == INSN_NOP) { + DINFO_FPRINTF(dinfo, "nop%d", insn->len); + return insn->len; + } + + /* + * Set the disassembler buffer to read data from the section + * containing the instruction to disassemble. + */ + dinfo->buffer = insn->sec->data->d_buf; + dinfo->buffer_vma = 0; + dinfo->buffer_length = insn->sec->sh.sh_size; + + return disasm(insn->offset, &dctx->info); +} + +size_t disas_insn(struct disas_context *dctx, struct instruction *insn) +{ + return disas_insn_common(dctx, insn, false); +} + +static size_t disas_insn_alt(struct disas_context *dctx, + struct instruction *insn) +{ + return disas_insn_common(dctx, insn, true); +} + +static struct instruction *next_insn_same_alt(struct objtool_file *file, + struct alt_group *alt_grp, + struct instruction *insn) +{ + if (alt_grp->last_insn == insn || alt_grp->nop == insn) + return NULL; + + return next_insn_same_sec(file, insn); +} + +#define alt_for_each_insn(file, alt_grp, insn) \ + for (insn = alt_grp->first_insn; \ + insn; \ + insn = next_insn_same_alt(file, alt_grp, insn)) + +/* + * Provide a name for the type of alternatives present at the + * specified instruction. + * + * An instruction can have alternatives with different types, for + * example alternative instructions and an exception table. In that + * case the name for the alternative instructions type is used. + * + * Return NULL if the instruction as no alternative. + */ +const char *disas_alt_type_name(struct instruction *insn) +{ + struct alternative *alt; + const char *name; + + name = NULL; + for (alt = insn->alts; alt; alt = alt->next) { + if (alt->type == ALT_TYPE_INSTRUCTIONS) { + name = "alternative"; + break; + } + + switch (alt->type) { + case ALT_TYPE_EX_TABLE: + name = "ex_table"; + break; + case ALT_TYPE_JUMP_TABLE: + name = "jump_table"; + break; + default: + name = "unknown"; + break; + } + } + + return name; +} + +/* + * Provide a name for an alternative. + */ +char *disas_alt_name(struct alternative *alt) +{ + char pfx[4] = { 0 }; + char *str = NULL; + const char *name; + int feature; + int flags; + int num; + + switch (alt->type) { + + case ALT_TYPE_EX_TABLE: + str = strdup("EXCEPTION"); + break; + + case ALT_TYPE_JUMP_TABLE: + str = strdup("JUMP"); + break; + + case ALT_TYPE_INSTRUCTIONS: + /* + * This is a non-default group alternative. Create a name + * based on the feature and flags associated with this + * alternative. Use either the feature name (it is available) + * or the feature number. And add a prefix to show the flags + * used. + * + * Prefix flags characters: + * + * '!' alternative used when feature not enabled + * '+' direct call alternative + * '?' unknown flag + */ + + if (!alt->insn->alt_group) + return NULL; + + feature = alt->insn->alt_group->feature; + num = alt_feature(feature); + flags = alt_flags(feature); + str = pfx; + + if (flags & ~(ALT_FLAG_NOT | ALT_FLAG_DIRECT_CALL)) + *str++ = '?'; + if (flags & ALT_FLAG_DIRECT_CALL) + *str++ = '+'; + if (flags & ALT_FLAG_NOT) + *str++ = '!'; + + name = arch_cpu_feature_name(num); + if (!name) + str = strfmt("%sFEATURE 0x%X", pfx, num); + else + str = strfmt("%s%s", pfx, name); + + break; + } + + return str; +} + +/* + * Initialize an alternative. The default alternative should be initialized + * with alt=NULL. + */ +static int disas_alt_init(struct disas_alt *dalt, + struct instruction *orig_insn, + struct alternative *alt) +{ + dalt->orig_insn = orig_insn; + dalt->alt = alt; + dalt->insn_idx = 0; + dalt->name = alt ? disas_alt_name(alt) : strdup("DEFAULT"); + if (!dalt->name) + return -1; + dalt->width = strlen(dalt->name); + + return 0; +} + +static int disas_alt_add_insn(struct disas_alt *dalt, int index, char *insn_str, + int offset, int nops) +{ + int len; + + if (index >= DISAS_ALT_INSN_MAX) { + WARN("Alternative %lx.%s has more instructions than supported", + DALT_ALTID(dalt), dalt->name); + return -1; + } + + len = strlen(insn_str); + dalt->insn[index].str = insn_str; + dalt->insn[index].offset = offset; + dalt->insn[index].nops = nops; + if (len > dalt->width) + dalt->width = len; + + return 0; +} + +static int disas_alt_jump(struct disas_alt *dalt) +{ + struct instruction *orig_insn; + struct instruction *dest_insn; + char suffix[2] = { 0 }; + char *str; + int nops; + + orig_insn = dalt->orig_insn; + dest_insn = dalt->alt->insn; + + if (orig_insn->type == INSN_NOP) { + if (orig_insn->len == 5) + suffix[0] = 'q'; + str = strfmt("jmp%-3s %lx <%s+0x%lx>", suffix, + dest_insn->offset, dest_insn->sym->name, + dest_insn->offset - dest_insn->sym->offset); + nops = 0; + } else { + str = strfmt("nop%d", orig_insn->len); + nops = orig_insn->len; + } + + if (!str) + return -1; + + disas_alt_add_insn(dalt, 0, str, 0, nops); + + return 1; +} + +/* + * Disassemble an exception table alternative. + */ +static int disas_alt_extable(struct disas_alt *dalt) +{ + struct instruction *alt_insn; + char *str; + + alt_insn = dalt->alt->insn; + str = strfmt("resume at 0x%lx <%s+0x%lx>", + alt_insn->offset, alt_insn->sym->name, + alt_insn->offset - alt_insn->sym->offset); + if (!str) + return -1; + + disas_alt_add_insn(dalt, 0, str, 0, 0); + + return 1; +} + +/* + * Disassemble an alternative and store instructions in the disas_alt + * structure. Return the number of instructions in the alternative. + */ +static int disas_alt_group(struct disas_context *dctx, struct disas_alt *dalt) +{ + struct objtool_file *file; + struct instruction *insn; + int offset; + char *str; + int count; + int nops; + int err; + + file = dctx->file; + count = 0; + offset = 0; + nops = 0; + + alt_for_each_insn(file, DALT_GROUP(dalt), insn) { + + disas_insn_alt(dctx, insn); + str = strdup(disas_result(dctx)); + if (!str) + return -1; + + nops = insn->type == INSN_NOP ? insn->len : 0; + err = disas_alt_add_insn(dalt, count, str, offset, nops); + if (err) + break; + offset += insn->len; + count++; + } + + return count; +} + +/* + * Disassemble the default alternative. + */ +static int disas_alt_default(struct disas_context *dctx, struct disas_alt *dalt) +{ + char *str; + int nops; + int err; + + if (DALT_GROUP(dalt)) + return disas_alt_group(dctx, dalt); + + /* + * Default alternative with no alt_group: this is the default + * code associated with either a jump table or an exception + * table and no other instruction alternatives. In that case + * the default alternative is made of a single instruction. + */ + disas_insn(dctx, dalt->orig_insn); + str = strdup(disas_result(dctx)); + if (!str) + return -1; + nops = dalt->orig_insn->type == INSN_NOP ? dalt->orig_insn->len : 0; + err = disas_alt_add_insn(dalt, 0, str, 0, nops); + if (err) + return -1; + + return 1; +} + +/* + * For each alternative, if there is an instruction at the specified + * offset then print this instruction, otherwise print a blank entry. + * The offset is an offset from the start of the alternative. + * + * Return the offset for the next instructions to print, or -1 if all + * instructions have been printed. + */ +static int disas_alt_print_insn(struct disas_alt *dalts, int alt_count, + int insn_count, int offset) +{ + struct disas_alt *dalt; + int offset_next; + char *str; + int i, j; + + offset_next = -1; + + for (i = 0; i < alt_count; i++) { + dalt = &dalts[i]; + j = dalt->insn_idx; + if (j == -1) { + printf("| %-*s ", dalt->width, ""); + continue; + } + + if (dalt->insn[j].offset == offset) { + str = dalt->insn[j].str; + printf("| %-*s ", dalt->width, str ?: ""); + if (++j < insn_count) { + dalt->insn_idx = j; + } else { + dalt->insn_idx = -1; + continue; + } + } else { + printf("| %-*s ", dalt->width, ""); + } + + if (dalt->insn[j].offset > 0 && + (offset_next == -1 || + (dalt->insn[j].offset < offset_next))) + offset_next = dalt->insn[j].offset; + } + printf("\n"); + + return offset_next; +} + +/* + * Print all alternatives side-by-side. + */ +static void disas_alt_print_wide(char *alt_name, struct disas_alt *dalts, int alt_count, + int insn_count) +{ + struct instruction *orig_insn; + int offset_next; + int offset; + int i; + + orig_insn = dalts[0].orig_insn; + + /* + * Print an header with the name of each alternative. + */ + disas_print_info(stdout, orig_insn, -2, NULL); + + if (strlen(alt_name) > dalts[0].width) + dalts[0].width = strlen(alt_name); + printf("| %-*s ", dalts[0].width, alt_name); + + for (i = 1; i < alt_count; i++) + printf("| %-*s ", dalts[i].width, dalts[i].name); + + printf("\n"); + + /* + * Print instructions for each alternative. + */ + offset_next = 0; + do { + offset = offset_next; + disas_print(stdout, orig_insn->sec, orig_insn->offset + offset, + -2, NULL); + offset_next = disas_alt_print_insn(dalts, alt_count, insn_count, + offset); + } while (offset_next > offset); +} + +/* + * Print all alternatives one above the other. + */ +static void disas_alt_print_compact(char *alt_name, struct disas_alt *dalts, + int alt_count, int insn_count) +{ + struct instruction *orig_insn; + int width; + int i, j; + int len; + + orig_insn = dalts[0].orig_insn; + + len = disas_print(stdout, orig_insn->sec, orig_insn->offset, 0, NULL); + printf("%s\n", alt_name); + + /* + * If all alternatives have a single instruction then print each + * alternative on a single line. Otherwise, print alternatives + * one above the other with a clear separation. + */ + + if (insn_count == 1) { + width = 0; + for (i = 0; i < alt_count; i++) { + if (dalts[i].width > width) + width = dalts[i].width; + } + + for (i = 0; i < alt_count; i++) { + printf("%*s= %-*s (if %s)\n", len, "", width, + dalts[i].insn[0].str, dalts[i].name); + } + + return; + } + + for (i = 0; i < alt_count; i++) { + printf("%*s= %s\n", len, "", dalts[i].name); + for (j = 0; j < insn_count; j++) { + if (!dalts[i].insn[j].str) + break; + disas_print(stdout, orig_insn->sec, + orig_insn->offset + dalts[i].insn[j].offset, 0, + "| %s\n", dalts[i].insn[j].str); + } + printf("%*s|\n", len, ""); + } +} + +/* + * Trim NOPs in alternatives. This replaces trailing NOPs in alternatives + * with a single indication of the number of bytes covered with NOPs. + * + * Return the maximum numbers of instructions in all alternatives after + * trailing NOPs have been trimmed. + */ +static int disas_alt_trim_nops(struct disas_alt *dalts, int alt_count, + int insn_count) +{ + struct disas_alt *dalt; + int nops_count; + const char *s; + int offset; + int count; + int nops; + int i, j; + + count = 0; + for (i = 0; i < alt_count; i++) { + offset = 0; + nops = 0; + nops_count = 0; + dalt = &dalts[i]; + for (j = insn_count - 1; j >= 0; j--) { + if (!dalt->insn[j].str || !dalt->insn[j].nops) + break; + offset = dalt->insn[j].offset; + free(dalt->insn[j].str); + dalt->insn[j].offset = 0; + dalt->insn[j].str = NULL; + nops += dalt->insn[j].nops; + nops_count++; + } + + /* + * All trailing NOPs have been removed. If there was a single + * NOP instruction then re-add it. If there was a block of + * NOPs then indicate the number of bytes than the block + * covers (nop*<number-of-bytes>). + */ + if (nops_count) { + s = nops_count == 1 ? "" : "*"; + dalt->insn[j + 1].str = strfmt("nop%s%d", s, nops); + dalt->insn[j + 1].offset = offset; + dalt->insn[j + 1].nops = nops; + j++; + } + + if (j > count) + count = j; + } + + return count + 1; +} + +/* + * Disassemble an alternative. + * + * Return the last instruction in the default alternative so that + * disassembly can continue with the next instruction. Return NULL + * on error. + */ +static void *disas_alt(struct disas_context *dctx, + struct instruction *orig_insn) +{ + struct disas_alt dalts[DISAS_ALT_MAX] = { 0 }; + struct instruction *last_insn = NULL; + struct alternative *alt; + struct disas_alt *dalt; + int insn_count = 0; + int alt_count = 0; + char *alt_name; + int count; + int i, j; + int err; + + alt_name = strfmt("<%s.%lx>", disas_alt_type_name(orig_insn), + orig_insn->offset); + if (!alt_name) { + WARN("Failed to define name for alternative at instruction 0x%lx", + orig_insn->offset); + goto done; + } + + /* + * Initialize and disassemble the default alternative. + */ + err = disas_alt_init(&dalts[0], orig_insn, NULL); + if (err) { + WARN("%s: failed to initialize default alternative", alt_name); + goto done; + } + + insn_count = disas_alt_default(dctx, &dalts[0]); + if (insn_count < 0) { + WARN("%s: failed to disassemble default alternative", alt_name); + goto done; + } + + /* + * Initialize and disassemble all other alternatives. + */ + i = 1; + for (alt = orig_insn->alts; alt; alt = alt->next) { + if (i >= DISAS_ALT_MAX) { + WARN("%s has more alternatives than supported", alt_name); + break; + } + + dalt = &dalts[i]; + err = disas_alt_init(dalt, orig_insn, alt); + if (err) { + WARN("%s: failed to disassemble alternative", alt_name); + goto done; + } + + count = -1; + switch (dalt->alt->type) { + case ALT_TYPE_INSTRUCTIONS: + count = disas_alt_group(dctx, dalt); + break; + case ALT_TYPE_EX_TABLE: + count = disas_alt_extable(dalt); + break; + case ALT_TYPE_JUMP_TABLE: + count = disas_alt_jump(dalt); + break; + } + if (count < 0) { + WARN("%s: failed to disassemble alternative %s", + alt_name, dalt->name); + goto done; + } + + insn_count = count > insn_count ? count : insn_count; + i++; + } + alt_count = i; + + /* + * Print default and non-default alternatives. + */ + + insn_count = disas_alt_trim_nops(dalts, alt_count, insn_count); + + if (opts.wide) + disas_alt_print_wide(alt_name, dalts, alt_count, insn_count); + else + disas_alt_print_compact(alt_name, dalts, alt_count, insn_count); + + last_insn = orig_insn->alt_group ? orig_insn->alt_group->last_insn : + orig_insn; + +done: + for (i = 0; i < alt_count; i++) { + free(dalts[i].name); + for (j = 0; j < insn_count; j++) + free(dalts[i].insn[j].str); + } + + free(alt_name); + + return last_insn; +} + +/* + * Disassemble a function. + */ +static void disas_func(struct disas_context *dctx, struct symbol *func) +{ + struct instruction *insn_start; + struct instruction *insn; + + printf("%s:\n", func->name); + sym_for_each_insn(dctx->file, func, insn) { + if (insn->alts) { + insn_start = insn; + insn = disas_alt(dctx, insn); + if (insn) + continue; + /* + * There was an error with disassembling + * the alternative. Resume disassembling + * at the current instruction, this will + * disassemble the default alternative + * only and continue with the code after + * the alternative. + */ + insn = insn_start; + } + + DISAS_PRINSN(dctx, insn, 0); + } + printf("\n"); +} + +/* + * Disassemble all warned functions. + */ +void disas_warned_funcs(struct disas_context *dctx) +{ + struct symbol *sym; + + if (!dctx) + return; + + for_each_sym(dctx->file->elf, sym) { + if (sym->warned) + disas_func(dctx, sym); + } +} + +void disas_funcs(struct disas_context *dctx) +{ + bool disas_all = !strcmp(opts.disas, "*"); + struct section *sec; + struct symbol *sym; + + for_each_sec(dctx->file->elf, sec) { + + if (!(sec->sh.sh_flags & SHF_EXECINSTR)) + continue; + + sec_for_each_sym(sec, sym) { + /* + * If the function had a warning and the verbose + * option is used then the function was already + * disassemble. + */ + if (opts.verbose && sym->warned) + continue; + + if (disas_all || fnmatch(opts.disas, sym->name, 0) == 0) + disas_func(dctx, sym); + } + } +} diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index ca5d77db692a..6a8ed9c62323 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -16,12 +16,17 @@ #include <string.h> #include <unistd.h> #include <errno.h> +#include <libgen.h> +#include <ctype.h> #include <linux/interval_tree_generic.h> #include <objtool/builtin.h> - #include <objtool/elf.h> #include <objtool/warn.h> +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_UP_POW2(x) (1U << ((8 * sizeof(x)) - __builtin_clz((x) - 1U))) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + static inline u32 str_hash(const char *str) { return jhash(str, strlen(str), 0); @@ -92,11 +97,12 @@ static inline unsigned long __sym_start(struct symbol *s) static inline unsigned long __sym_last(struct symbol *s) { - return s->offset + s->len - 1; + return s->offset + (s->len ? s->len - 1 : 0); } INTERVAL_TREE_DEFINE(struct symbol, node, unsigned long, __subtree_last, - __sym_start, __sym_last, static, __sym) + __sym_start, __sym_last, static inline __maybe_unused, + __sym) #define __sym_for_each(_iter, _tree, _start, _end) \ for (_iter = __sym_iter_first((_tree), (_start), (_end)); \ @@ -108,7 +114,7 @@ struct symbol_hole { }; /* - * Find !section symbol where @offset is after it. + * Find the last symbol before @offset. */ static int symbol_hole_by_offset(const void *key, const struct rb_node *node) { @@ -119,8 +125,7 @@ static int symbol_hole_by_offset(const void *key, const struct rb_node *node) return -1; if (sh->key >= s->offset + s->len) { - if (s->type != STT_SECTION) - sh->sym = s; + sh->sym = s; return 1; } @@ -167,11 +172,11 @@ static struct symbol *find_symbol_by_index(struct elf *elf, unsigned int idx) struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) { struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; - struct symbol *iter; + struct symbol *sym; - __sym_for_each(iter, tree, offset, offset) { - if (iter->offset == offset && iter->type != STT_SECTION) - return iter; + __sym_for_each(sym, tree, offset, offset) { + if (sym->offset == offset && !is_sec_sym(sym)) + return sym->alias; } return NULL; @@ -180,11 +185,11 @@ struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset) struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) { struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; - struct symbol *iter; + struct symbol *func; - __sym_for_each(iter, tree, offset, offset) { - if (iter->offset == offset && iter->type == STT_FUNC) - return iter; + __sym_for_each(func, tree, offset, offset) { + if (func->offset == offset && is_func_sym(func)) + return func->alias; } return NULL; @@ -193,14 +198,29 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset) struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset) { struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; - struct symbol *iter; + struct symbol *sym = NULL, *tmp; - __sym_for_each(iter, tree, offset, offset) { - if (iter->type != STT_SECTION) - return iter; + __sym_for_each(tmp, tree, offset, offset) { + if (tmp->len) { + if (!sym) { + sym = tmp; + continue; + } + + if (sym->offset != tmp->offset || sym->len != tmp->len) { + /* + * In the rare case of overlapping symbols, + * pick the smaller one. + * + * TODO: outlaw overlapping symbols + */ + if (tmp->len < sym->len) + sym = tmp; + } + } } - return NULL; + return sym ? sym->alias : NULL; } /* @@ -246,11 +266,11 @@ int find_symbol_hole_containing(const struct section *sec, unsigned long offset) struct symbol *find_func_containing(struct section *sec, unsigned long offset) { struct rb_root_cached *tree = (struct rb_root_cached *)&sec->symbol_tree; - struct symbol *iter; + struct symbol *func; - __sym_for_each(iter, tree, offset, offset) { - if (iter->type == STT_FUNC) - return iter; + __sym_for_each(func, tree, offset, offset) { + if (is_func_sym(func)) + return func->alias; } return NULL; @@ -268,6 +288,35 @@ struct symbol *find_symbol_by_name(const struct elf *elf, const char *name) return NULL; } +/* Find local symbol with matching STT_FILE */ +static struct symbol *find_local_symbol_by_file_and_name(const struct elf *elf, + struct symbol *file, + const char *name) +{ + struct symbol *sym; + + elf_hash_for_each_possible(symbol_name, sym, name_hash, str_hash(name)) { + if (sym->bind == STB_LOCAL && sym->file == file && + !strcmp(sym->name, name)) { + return sym; + } + } + + return NULL; +} + +struct symbol *find_global_symbol_by_name(const struct elf *elf, const char *name) +{ + struct symbol *sym; + + elf_hash_for_each_possible(symbol_name, sym, name_hash, str_hash(name)) { + if (!strcmp(sym->name, name) && !is_local_sym(sym)) + return sym; + } + + return NULL; +} + struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len) { @@ -358,14 +407,14 @@ static int read_sections(struct elf *elf) return -1; } - if (sec->sh.sh_size != 0 && !is_dwarf_section(sec)) { + if (sec_size(sec) != 0 && !is_dwarf_section(sec)) { sec->data = elf_getdata(s, NULL); if (!sec->data) { ERROR_ELF("elf_getdata"); return -1; } if (sec->data->d_off != 0 || - sec->data->d_size != sec->sh.sh_size) { + sec->data->d_size != sec_size(sec)) { ERROR("unexpected data attributes for %s", sec->name); return -1; } @@ -393,7 +442,38 @@ static int read_sections(struct elf *elf) return 0; } -static void elf_add_symbol(struct elf *elf, struct symbol *sym) +static const char *demangle_name(struct symbol *sym) +{ + char *str; + + if (!is_local_sym(sym)) + return sym->name; + + if (!is_func_sym(sym) && !is_object_sym(sym)) + return sym->name; + + if (!strstarts(sym->name, "__UNIQUE_ID_") && !strchr(sym->name, '.')) + return sym->name; + + str = strdup(sym->name); + if (!str) { + ERROR_GLIBC("strdup"); + return NULL; + } + + for (int i = strlen(str) - 1; i >= 0; i--) { + char c = str[i]; + + if (!isdigit(c) && c != '.') { + str[i + 1] = '\0'; + break; + } + } + + return str; +} + +static int elf_add_symbol(struct elf *elf, struct symbol *sym) { struct list_head *entry; struct rb_node *pnode; @@ -405,14 +485,15 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) sym->type = GELF_ST_TYPE(sym->sym.st_info); sym->bind = GELF_ST_BIND(sym->sym.st_info); - if (sym->type == STT_FILE) + if (is_file_sym(sym)) elf->num_files++; sym->offset = sym->sym.st_value; sym->len = sym->sym.st_size; __sym_for_each(iter, &sym->sec->symbol_tree, sym->offset, sym->offset) { - if (iter->offset == sym->offset && iter->type == sym->type) + if (!is_undef_sym(iter) && iter->offset == sym->offset && + iter->type == sym->type && iter->len == sym->len) iter->alias = sym; } @@ -423,21 +504,44 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) else entry = &sym->sec->symbol_list; list_add(&sym->list, entry); + + list_add_tail(&sym->global_list, &elf->symbols); elf_hash_add(symbol, &sym->hash, sym->idx); elf_hash_add(symbol_name, &sym->name_hash, str_hash(sym->name)); - /* - * Don't store empty STT_NOTYPE symbols in the rbtree. They - * can exist within a function, confusing the sorting. - */ - if (!sym->len) - __sym_remove(sym, &sym->sec->symbol_tree); + if (is_func_sym(sym) && + (strstarts(sym->name, "__pfx_") || + strstarts(sym->name, "__cfi_") || + strstarts(sym->name, "__pi___pfx_") || + strstarts(sym->name, "__pi___cfi_"))) + sym->prefix = 1; + + if (strstarts(sym->name, ".klp.sym")) + sym->klp = 1; + + if (!sym->klp && !is_sec_sym(sym) && strstr(sym->name, ".cold")) { + sym->cold = 1; + + /* + * Clang doesn't mark cold subfunctions as STT_FUNC, which + * breaks several objtool assumptions. Fake it. + */ + sym->type = STT_FUNC; + } + + sym->pfunc = sym->cfunc = sym; + + sym->demangled_name = demangle_name(sym); + if (!sym->demangled_name) + return -1; + + return 0; } static int read_symbols(struct elf *elf) { struct section *symtab, *symtab_shndx, *sec; - struct symbol *sym, *pfunc; + struct symbol *sym, *pfunc, *file = NULL; int symbols_nr, i; char *coldstr; Elf_Data *shndx_data = NULL; @@ -469,6 +573,9 @@ static int read_symbols(struct elf *elf) ERROR_GLIBC("calloc"); return -1; } + + INIT_LIST_HEAD(&elf->symbols); + for (i = 0; i < symbols_nr; i++) { sym = &elf->symbol_data[i]; @@ -477,14 +584,14 @@ static int read_symbols(struct elf *elf) if (!gelf_getsymshndx(symtab->data, shndx_data, i, &sym->sym, &shndx)) { ERROR_ELF("gelf_getsymshndx"); - goto err; + return -1; } sym->name = elf_strptr(elf->elf, symtab->sh.sh_link, sym->sym.st_name); if (!sym->name) { ERROR_ELF("elf_strptr"); - goto err; + return -1; } if ((sym->sym.st_shndx > SHN_UNDEF && @@ -496,7 +603,7 @@ static int read_symbols(struct elf *elf) sym->sec = find_section_by_index(elf, shndx); if (!sym->sec) { ERROR("couldn't find section for symbol %s", sym->name); - goto err; + return -1; } if (GELF_ST_TYPE(sym->sym.st_info) == STT_SECTION) { sym->name = sym->sec->name; @@ -505,7 +612,13 @@ static int read_symbols(struct elf *elf) } else sym->sec = find_section_by_index(elf, 0); - elf_add_symbol(elf, sym); + if (elf_add_symbol(elf, sym)) + return -1; + + if (sym->type == STT_FILE) + file = sym; + else if (sym->bind == STB_LOCAL) + sym->file = file; } if (opts.stats) { @@ -518,18 +631,15 @@ static int read_symbols(struct elf *elf) sec_for_each_sym(sec, sym) { char *pname; size_t pnamelen; - if (sym->type != STT_FUNC) - continue; - - if (sym->pfunc == NULL) - sym->pfunc = sym; - if (sym->cfunc == NULL) - sym->cfunc = sym; + if (!sym->cold) + continue; coldstr = strstr(sym->name, ".cold"); - if (!coldstr) - continue; + if (!coldstr) { + ERROR("%s(): cold subfunction without \".cold\"?", sym->name); + return -1; + } pnamelen = coldstr - sym->name; pname = strndup(sym->name, pnamelen); @@ -538,7 +648,9 @@ static int read_symbols(struct elf *elf) return -1; } - pfunc = find_symbol_by_name(elf, pname); + pfunc = find_local_symbol_by_file_and_name(elf, sym->file, pname); + if (!pfunc) + pfunc = find_global_symbol_by_name(elf, pname); free(pname); if (!pfunc) { @@ -546,8 +658,9 @@ static int read_symbols(struct elf *elf) return -1; } - sym->pfunc = pfunc; + sym->pfunc = pfunc->alias; pfunc->cfunc = sym; + pfunc->alias->cfunc = sym; /* * Unfortunately, -fnoreorder-functions puts the child @@ -566,10 +679,6 @@ static int read_symbols(struct elf *elf) } return 0; - -err: - free(sym); - return -1; } static int mark_group_syms(struct elf *elf) @@ -583,7 +692,7 @@ static int mark_group_syms(struct elf *elf) return -1; } - list_for_each_entry(sec, &elf->sections, list) { + for_each_sec(elf, sec) { if (sec->sh.sh_type == SHT_GROUP && sec->sh.sh_link == symtab->idx) { sym = find_symbol_by_index(elf, sec->sh.sh_info); @@ -624,7 +733,7 @@ static int elf_update_sym_relocs(struct elf *elf, struct symbol *sym) static int elf_update_symbol(struct elf *elf, struct section *symtab, struct section *symtab_shndx, struct symbol *sym) { - Elf32_Word shndx = sym->sec ? sym->sec->idx : SHN_UNDEF; + Elf32_Word shndx; Elf_Data *symtab_data = NULL, *shndx_data = NULL; Elf64_Xword entsize = symtab->sh.sh_entsize; int max_idx, idx = sym->idx; @@ -632,8 +741,7 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, bool is_special_shndx = sym->sym.st_shndx >= SHN_LORESERVE && sym->sym.st_shndx != SHN_XINDEX; - if (is_special_shndx) - shndx = sym->sym.st_shndx; + shndx = is_special_shndx ? sym->sym.st_shndx : sym->sec->idx; s = elf_getscn(elf->elf, symtab->idx); if (!s) { @@ -731,7 +839,7 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, } /* setup extended section index magic and write the symbol */ - if ((shndx >= SHN_UNDEF && shndx < SHN_LORESERVE) || is_special_shndx) { + if (shndx < SHN_LORESERVE || is_special_shndx) { sym->sym.st_shndx = shndx; if (!shndx_data) shndx = 0; @@ -751,24 +859,58 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab, return 0; } -static struct symbol * -__elf_create_symbol(struct elf *elf, struct symbol *sym) +struct symbol *elf_create_symbol(struct elf *elf, const char *name, + struct section *sec, unsigned int bind, + unsigned int type, unsigned long offset, + size_t size) { struct section *symtab, *symtab_shndx; Elf32_Word first_non_local, new_idx; - struct symbol *old; + struct symbol *old, *sym; - symtab = find_section_by_name(elf, ".symtab"); - if (symtab) { - symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); + sym = calloc(1, sizeof(*sym)); + if (!sym) { + ERROR_GLIBC("calloc"); + return NULL; + } + + sym->name = strdup(name); + if (!sym->name) { + ERROR_GLIBC("strdup"); + return NULL; + } + + if (type != STT_SECTION) { + sym->sym.st_name = elf_add_string(elf, NULL, sym->name); + if (sym->sym.st_name == -1) + return NULL; + } + + if (sec) { + sym->sec = sec; } else { + sym->sec = find_section_by_index(elf, 0); + if (!sym->sec) { + ERROR("no NULL section"); + return NULL; + } + } + + sym->sym.st_info = GELF_ST_INFO(bind, type); + sym->sym.st_value = offset; + sym->sym.st_size = size; + + symtab = find_section_by_name(elf, ".symtab"); + if (!symtab) { ERROR("no .symtab"); return NULL; } + symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); + new_idx = sec_num_entries(symtab); - if (GELF_ST_BIND(sym->sym.st_info) != STB_LOCAL) + if (bind != STB_LOCAL) goto non_local; /* @@ -806,10 +948,8 @@ __elf_create_symbol(struct elf *elf, struct symbol *sym) non_local: sym->idx = new_idx; - if (elf_update_symbol(elf, symtab, symtab_shndx, sym)) { - ERROR("elf_update_symbol"); + if (sym->idx && elf_update_symbol(elf, symtab, symtab_shndx, sym)) return NULL; - } symtab->sh.sh_size += symtab->sh.sh_entsize; mark_sec_changed(elf, symtab, true); @@ -819,70 +959,28 @@ non_local: mark_sec_changed(elf, symtab_shndx, true); } - return sym; -} - -static struct symbol * -elf_create_section_symbol(struct elf *elf, struct section *sec) -{ - struct symbol *sym = calloc(1, sizeof(*sym)); - - if (!sym) { - ERROR_GLIBC("malloc"); + if (elf_add_symbol(elf, sym)) return NULL; - } - - sym->name = sec->name; - sym->sec = sec; - - // st_name 0 - sym->sym.st_info = GELF_ST_INFO(STB_LOCAL, STT_SECTION); - // st_other 0 - // st_value 0 - // st_size 0 - - sym = __elf_create_symbol(elf, sym); - if (sym) - elf_add_symbol(elf, sym); return sym; } -static int elf_add_string(struct elf *elf, struct section *strtab, char *str); - -struct symbol * -elf_create_prefix_symbol(struct elf *elf, struct symbol *orig, long size) +struct symbol *elf_create_section_symbol(struct elf *elf, struct section *sec) { struct symbol *sym = calloc(1, sizeof(*sym)); - size_t namelen = strlen(orig->name) + sizeof("__pfx_"); - char *name = malloc(namelen); - if (!sym || !name) { - ERROR_GLIBC("malloc"); + sym = elf_create_symbol(elf, sec->name, sec, STB_LOCAL, STT_SECTION, 0, 0); + if (!sym) return NULL; - } - snprintf(name, namelen, "__pfx_%s", orig->name); - - sym->name = name; - sym->sec = orig->sec; - - sym->sym.st_name = elf_add_string(elf, NULL, name); - sym->sym.st_info = orig->sym.st_info; - sym->sym.st_value = orig->sym.st_value - size; - sym->sym.st_size = size; - - sym = __elf_create_symbol(elf, sym); - if (sym) - elf_add_symbol(elf, sym); + sec->sym = sym; return sym; } -static struct reloc *elf_init_reloc(struct elf *elf, struct section *rsec, - unsigned int reloc_idx, - unsigned long offset, struct symbol *sym, - s64 addend, unsigned int type) +struct reloc *elf_init_reloc(struct elf *elf, struct section *rsec, + unsigned int reloc_idx, unsigned long offset, + struct symbol *sym, s64 addend, unsigned int type) { struct reloc *reloc, empty = { 0 }; @@ -922,9 +1020,9 @@ struct reloc *elf_init_reloc_text_sym(struct elf *elf, struct section *sec, unsigned long insn_off) { struct symbol *sym = insn_sec->sym; - int addend = insn_off; + s64 addend = insn_off; - if (!(insn_sec->sh.sh_flags & SHF_EXECINSTR)) { + if (!is_text_sec(insn_sec)) { ERROR("bad call to %s() for data symbol %s", __func__, sym->name); return NULL; } @@ -939,8 +1037,6 @@ struct reloc *elf_init_reloc_text_sym(struct elf *elf, struct section *sec, sym = elf_create_section_symbol(elf, insn_sec); if (!sym) return NULL; - - insn_sec->sym = sym; } return elf_init_reloc(elf, sec->rsec, reloc_idx, offset, sym, addend, @@ -953,7 +1049,7 @@ struct reloc *elf_init_reloc_data_sym(struct elf *elf, struct section *sec, struct symbol *sym, s64 addend) { - if (sym->sec && (sec->sh.sh_flags & SHF_EXECINSTR)) { + if (is_text_sec(sec)) { ERROR("bad call to %s() for text symbol %s", __func__, sym->name); return NULL; } @@ -986,12 +1082,16 @@ static int read_relocs(struct elf *elf) rsec->base->rsec = rsec; - nr_reloc = 0; + /* nr_alloc_relocs=0: libelf owns d_buf */ + rsec->nr_alloc_relocs = 0; + rsec->relocs = calloc(sec_num_entries(rsec), sizeof(*reloc)); if (!rsec->relocs) { ERROR_GLIBC("calloc"); return -1; } + + nr_reloc = 0; for (i = 0; i < sec_num_entries(rsec); i++) { reloc = &rsec->relocs[i]; @@ -1044,6 +1144,12 @@ struct elf *elf_open_read(const char *name, int flags) goto err; } + elf->name = strdup(name); + if (!elf->name) { + ERROR_GLIBC("strdup"); + return NULL; + } + if ((flags & O_ACCMODE) == O_RDONLY) cmd = ELF_C_READ_MMAP; else if ((flags & O_ACCMODE) == O_RDWR) @@ -1081,11 +1187,142 @@ err: return NULL; } -static int elf_add_string(struct elf *elf, struct section *strtab, char *str) +struct elf *elf_create_file(GElf_Ehdr *ehdr, const char *name) { - Elf_Data *data; - Elf_Scn *s; - int len; + struct section *null, *symtab, *strtab, *shstrtab; + char *dir, *base, *tmp_name; + struct symbol *sym; + struct elf *elf; + + elf_version(EV_CURRENT); + + elf = calloc(1, sizeof(*elf)); + if (!elf) { + ERROR_GLIBC("calloc"); + return NULL; + } + + INIT_LIST_HEAD(&elf->sections); + + dir = strdup(name); + if (!dir) { + ERROR_GLIBC("strdup"); + return NULL; + } + + dir = dirname(dir); + + base = strdup(name); + if (!base) { + ERROR_GLIBC("strdup"); + return NULL; + } + + base = basename(base); + + tmp_name = malloc(256); + if (!tmp_name) { + ERROR_GLIBC("malloc"); + return NULL; + } + + snprintf(tmp_name, 256, "%s/%s.XXXXXX", dir, base); + + elf->fd = mkstemp(tmp_name); + if (elf->fd == -1) { + ERROR_GLIBC("can't create tmp file"); + exit(1); + } + + elf->tmp_name = tmp_name; + + elf->name = strdup(name); + if (!elf->name) { + ERROR_GLIBC("strdup"); + return NULL; + } + + elf->elf = elf_begin(elf->fd, ELF_C_WRITE, NULL); + if (!elf->elf) { + ERROR_ELF("elf_begin"); + return NULL; + } + + if (!gelf_newehdr(elf->elf, ELFCLASS64)) { + ERROR_ELF("gelf_newehdr"); + return NULL; + } + + memcpy(&elf->ehdr, ehdr, sizeof(elf->ehdr)); + + if (!gelf_update_ehdr(elf->elf, &elf->ehdr)) { + ERROR_ELF("gelf_update_ehdr"); + return NULL; + } + + INIT_LIST_HEAD(&elf->symbols); + + if (!elf_alloc_hash(section, 1000) || + !elf_alloc_hash(section_name, 1000) || + !elf_alloc_hash(symbol, 10000) || + !elf_alloc_hash(symbol_name, 10000) || + !elf_alloc_hash(reloc, 100000)) + return NULL; + + null = elf_create_section(elf, NULL, 0, 0, SHT_NULL, 0, 0); + shstrtab = elf_create_section(elf, NULL, 0, 0, SHT_STRTAB, 1, 0); + strtab = elf_create_section(elf, NULL, 0, 0, SHT_STRTAB, 1, 0); + + if (!null || !shstrtab || !strtab) + return NULL; + + null->name = ""; + shstrtab->name = ".shstrtab"; + strtab->name = ".strtab"; + + null->sh.sh_name = elf_add_string(elf, shstrtab, null->name); + shstrtab->sh.sh_name = elf_add_string(elf, shstrtab, shstrtab->name); + strtab->sh.sh_name = elf_add_string(elf, shstrtab, strtab->name); + + if (null->sh.sh_name == -1 || shstrtab->sh.sh_name == -1 || strtab->sh.sh_name == -1) + return NULL; + + elf_hash_add(section_name, &null->name_hash, str_hash(null->name)); + elf_hash_add(section_name, &strtab->name_hash, str_hash(strtab->name)); + elf_hash_add(section_name, &shstrtab->name_hash, str_hash(shstrtab->name)); + + if (elf_add_string(elf, strtab, "") == -1) + return NULL; + + symtab = elf_create_section(elf, ".symtab", 0x18, 0x18, SHT_SYMTAB, 0x8, 0); + if (!symtab) + return NULL; + + symtab->sh.sh_link = strtab->idx; + symtab->sh.sh_info = 1; + + elf->ehdr.e_shstrndx = shstrtab->idx; + if (!gelf_update_ehdr(elf->elf, &elf->ehdr)) { + ERROR_ELF("gelf_update_ehdr"); + return NULL; + } + + sym = calloc(1, sizeof(*sym)); + if (!sym) { + ERROR_GLIBC("calloc"); + return NULL; + } + + sym->name = ""; + sym->sec = null; + elf_add_symbol(elf, sym); + + return elf; +} + +unsigned int elf_add_string(struct elf *elf, struct section *strtab, const char *str) +{ + unsigned int offset; if (!strtab) strtab = find_section_by_name(elf, ".strtab"); @@ -1094,76 +1331,109 @@ static int elf_add_string(struct elf *elf, struct section *strtab, char *str) return -1; } - s = elf_getscn(elf->elf, strtab->idx); + if (!strtab->sh.sh_addralign) { + ERROR("'%s': invalid sh_addralign", strtab->name); + return -1; + } + + offset = ALIGN_UP(strtab->sh.sh_size, strtab->sh.sh_addralign); + + if (!elf_add_data(elf, strtab, str, strlen(str) + 1)) + return -1; + + return offset; +} + +void *elf_add_data(struct elf *elf, struct section *sec, const void *data, size_t size) +{ + unsigned long offset; + Elf_Scn *s; + + if (!sec->sh.sh_addralign) { + ERROR("'%s': invalid sh_addralign", sec->name); + return NULL; + } + + s = elf_getscn(elf->elf, sec->idx); if (!s) { ERROR_ELF("elf_getscn"); - return -1; + return NULL; } - data = elf_newdata(s); - if (!data) { + sec->data = elf_newdata(s); + if (!sec->data) { ERROR_ELF("elf_newdata"); - return -1; + return NULL; } - data->d_buf = str; - data->d_size = strlen(str) + 1; - data->d_align = 1; + sec->data->d_buf = calloc(1, size); + if (!sec->data->d_buf) { + ERROR_GLIBC("calloc"); + return NULL; + } - len = strtab->sh.sh_size; - strtab->sh.sh_size += data->d_size; + if (data) + memcpy(sec->data->d_buf, data, size); - mark_sec_changed(elf, strtab, true); + sec->data->d_size = size; + sec->data->d_align = 1; - return len; + offset = ALIGN_UP(sec->sh.sh_size, sec->sh.sh_addralign); + sec->sh.sh_size = offset + size; + + mark_sec_changed(elf, sec, true); + + return sec->data->d_buf; } struct section *elf_create_section(struct elf *elf, const char *name, - size_t entsize, unsigned int nr) + size_t size, size_t entsize, + unsigned int type, unsigned int align, + unsigned int flags) { struct section *sec, *shstrtab; - size_t size = entsize * nr; Elf_Scn *s; - sec = malloc(sizeof(*sec)); + if (name && find_section_by_name(elf, name)) { + ERROR("section '%s' already exists", name); + return NULL; + } + + sec = calloc(1, sizeof(*sec)); if (!sec) { - ERROR_GLIBC("malloc"); + ERROR_GLIBC("calloc"); return NULL; } - memset(sec, 0, sizeof(*sec)); INIT_LIST_HEAD(&sec->symbol_list); + /* don't actually create the section, just the data structures */ + if (type == SHT_NULL) + goto add; + s = elf_newscn(elf->elf); if (!s) { ERROR_ELF("elf_newscn"); return NULL; } - sec->name = strdup(name); - if (!sec->name) { - ERROR_GLIBC("strdup"); - return NULL; - } - sec->idx = elf_ndxscn(s); - sec->data = elf_newdata(s); - if (!sec->data) { - ERROR_ELF("elf_newdata"); - return NULL; - } + if (size) { + sec->data = elf_newdata(s); + if (!sec->data) { + ERROR_ELF("elf_newdata"); + return NULL; + } - sec->data->d_size = size; - sec->data->d_align = 1; + sec->data->d_size = size; + sec->data->d_align = 1; - if (size) { - sec->data->d_buf = malloc(size); + sec->data->d_buf = calloc(1, size); if (!sec->data->d_buf) { - ERROR_GLIBC("malloc"); + ERROR_GLIBC("calloc"); return NULL; } - memset(sec->data->d_buf, 0, size); } if (!gelf_getshdr(s, &sec->sh)) { @@ -1173,34 +1443,152 @@ struct section *elf_create_section(struct elf *elf, const char *name, sec->sh.sh_size = size; sec->sh.sh_entsize = entsize; - sec->sh.sh_type = SHT_PROGBITS; - sec->sh.sh_addralign = 1; - sec->sh.sh_flags = SHF_ALLOC; - - /* Add section name to .shstrtab (or .strtab for Clang) */ - shstrtab = find_section_by_name(elf, ".shstrtab"); - if (!shstrtab) - shstrtab = find_section_by_name(elf, ".strtab"); - if (!shstrtab) { - ERROR("can't find .shstrtab or .strtab section"); - return NULL; + sec->sh.sh_type = type; + sec->sh.sh_addralign = align; + sec->sh.sh_flags = flags; + + if (name) { + sec->name = strdup(name); + if (!sec->name) { + ERROR("strdup"); + return NULL; + } + + /* Add section name to .shstrtab (or .strtab for Clang) */ + shstrtab = find_section_by_name(elf, ".shstrtab"); + if (!shstrtab) { + shstrtab = find_section_by_name(elf, ".strtab"); + if (!shstrtab) { + ERROR("can't find .shstrtab or .strtab"); + return NULL; + } + } + sec->sh.sh_name = elf_add_string(elf, shstrtab, sec->name); + if (sec->sh.sh_name == -1) + return NULL; + + elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name)); } - sec->sh.sh_name = elf_add_string(elf, shstrtab, sec->name); - if (sec->sh.sh_name == -1) - return NULL; +add: list_add_tail(&sec->list, &elf->sections); elf_hash_add(section, &sec->hash, sec->idx); - elf_hash_add(section_name, &sec->name_hash, str_hash(sec->name)); mark_sec_changed(elf, sec, true); return sec; } -static struct section *elf_create_rela_section(struct elf *elf, - struct section *sec, - unsigned int reloc_nr) +static int elf_alloc_reloc(struct elf *elf, struct section *rsec) +{ + struct reloc *old_relocs, *old_relocs_end, *new_relocs; + unsigned int nr_relocs_old = sec_num_entries(rsec); + unsigned int nr_relocs_new = nr_relocs_old + 1; + unsigned long nr_alloc; + struct symbol *sym; + + if (!rsec->data) { + rsec->data = elf_newdata(elf_getscn(elf->elf, rsec->idx)); + if (!rsec->data) { + ERROR_ELF("elf_newdata"); + return -1; + } + + rsec->data->d_align = 1; + rsec->data->d_type = ELF_T_RELA; + rsec->data->d_buf = NULL; + } + + rsec->data->d_size = nr_relocs_new * elf_rela_size(elf); + rsec->sh.sh_size = rsec->data->d_size; + + nr_alloc = MAX(64, ALIGN_UP_POW2(nr_relocs_new)); + if (nr_alloc <= rsec->nr_alloc_relocs) + return 0; + + if (rsec->data->d_buf && !rsec->nr_alloc_relocs) { + void *orig_buf = rsec->data->d_buf; + + /* + * The original d_buf is owned by libelf so it can't be + * realloced. + */ + rsec->data->d_buf = malloc(nr_alloc * elf_rela_size(elf)); + if (!rsec->data->d_buf) { + ERROR_GLIBC("malloc"); + return -1; + } + memcpy(rsec->data->d_buf, orig_buf, + nr_relocs_old * elf_rela_size(elf)); + } else { + rsec->data->d_buf = realloc(rsec->data->d_buf, + nr_alloc * elf_rela_size(elf)); + if (!rsec->data->d_buf) { + ERROR_GLIBC("realloc"); + return -1; + } + } + + rsec->nr_alloc_relocs = nr_alloc; + + old_relocs = rsec->relocs; + new_relocs = calloc(nr_alloc, sizeof(struct reloc)); + if (!new_relocs) { + ERROR_GLIBC("calloc"); + return -1; + } + + if (!old_relocs) + goto done; + + /* + * The struct reloc's address has changed. Update all the symbols and + * relocs which reference it. + */ + + old_relocs_end = &old_relocs[nr_relocs_old]; + for_each_sym(elf, sym) { + struct reloc *reloc; + + reloc = sym->relocs; + if (!reloc) + continue; + + if (reloc >= old_relocs && reloc < old_relocs_end) + sym->relocs = &new_relocs[reloc - old_relocs]; + + while (1) { + struct reloc *next_reloc = sym_next_reloc(reloc); + + if (!next_reloc) + break; + + if (next_reloc >= old_relocs && next_reloc < old_relocs_end) + set_sym_next_reloc(reloc, &new_relocs[next_reloc - old_relocs]); + + reloc = next_reloc; + } + } + + memcpy(new_relocs, old_relocs, nr_relocs_old * sizeof(struct reloc)); + + for (int i = 0; i < nr_relocs_old; i++) { + struct reloc *old = &old_relocs[i]; + struct reloc *new = &new_relocs[i]; + u32 key = reloc_hash(old); + + elf_hash_del(reloc, &old->hash, key); + elf_hash_add(reloc, &new->hash, key); + } + + free(old_relocs); +done: + rsec->relocs = new_relocs; + return 0; +} + +struct section *elf_create_rela_section(struct elf *elf, struct section *sec, + unsigned int nr_relocs) { struct section *rsec; char *rsec_name; @@ -1213,41 +1601,72 @@ static struct section *elf_create_rela_section(struct elf *elf, strcpy(rsec_name, ".rela"); strcat(rsec_name, sec->name); - rsec = elf_create_section(elf, rsec_name, elf_rela_size(elf), reloc_nr); + rsec = elf_create_section(elf, rsec_name, nr_relocs * elf_rela_size(elf), + elf_rela_size(elf), SHT_RELA, elf_addr_size(elf), + SHF_INFO_LINK); free(rsec_name); if (!rsec) return NULL; - rsec->data->d_type = ELF_T_RELA; - rsec->sh.sh_type = SHT_RELA; - rsec->sh.sh_addralign = elf_addr_size(elf); - rsec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx; - rsec->sh.sh_info = sec->idx; - rsec->sh.sh_flags = SHF_INFO_LINK; + if (nr_relocs) { + rsec->data->d_type = ELF_T_RELA; - rsec->relocs = calloc(sec_num_entries(rsec), sizeof(struct reloc)); - if (!rsec->relocs) { - ERROR_GLIBC("calloc"); - return NULL; + rsec->nr_alloc_relocs = nr_relocs; + rsec->relocs = calloc(nr_relocs, sizeof(struct reloc)); + if (!rsec->relocs) { + ERROR_GLIBC("calloc"); + return NULL; + } } + rsec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx; + rsec->sh.sh_info = sec->idx; + sec->rsec = rsec; rsec->base = sec; return rsec; } +struct reloc *elf_create_reloc(struct elf *elf, struct section *sec, + unsigned long offset, + struct symbol *sym, s64 addend, + unsigned int type) +{ + struct section *rsec = sec->rsec; + + if (!rsec) { + rsec = elf_create_rela_section(elf, sec, 0); + if (!rsec) + return NULL; + } + + if (find_reloc_by_dest(elf, sec, offset)) { + ERROR_FUNC(sec, offset, "duplicate reloc"); + return NULL; + } + + if (elf_alloc_reloc(elf, rsec)) + return NULL; + + mark_sec_changed(elf, rsec, true); + + return elf_init_reloc(elf, rsec, sec_num_entries(rsec) - 1, offset, sym, + addend, type); +} + struct section *elf_create_section_pair(struct elf *elf, const char *name, size_t entsize, unsigned int nr, - unsigned int reloc_nr) + unsigned int nr_relocs) { struct section *sec; - sec = elf_create_section(elf, name, entsize, nr); + sec = elf_create_section(elf, name, nr * entsize, entsize, + SHT_PROGBITS, 1, SHF_ALLOC); if (!sec) return NULL; - if (!elf_create_rela_section(elf, sec, reloc_nr)) + if (!elf_create_rela_section(elf, sec, nr_relocs)) return NULL; return sec; @@ -1282,7 +1701,7 @@ int elf_write_insn(struct elf *elf, struct section *sec, */ static int elf_truncate_section(struct elf *elf, struct section *sec) { - u64 size = sec->sh.sh_size; + u64 size = sec_size(sec); bool truncated = false; Elf_Data *data = NULL; Elf_Scn *s; @@ -1296,7 +1715,6 @@ static int elf_truncate_section(struct elf *elf, struct section *sec) for (;;) { /* get next data descriptor for the relevant section */ data = elf_getdata(s, data); - if (!data) { if (size) { ERROR("end of section data but non-zero size left\n"); @@ -1332,8 +1750,8 @@ int elf_write(struct elf *elf) /* Update changed relocation sections and section headers: */ list_for_each_entry(sec, &elf->sections, list) { - if (sec->truncate) - elf_truncate_section(elf, sec); + if (sec->truncate && elf_truncate_section(elf, sec)) + return -1; if (sec_changed(sec)) { s = elf_getscn(elf->elf, sec->idx); @@ -1366,7 +1784,7 @@ int elf_write(struct elf *elf) return 0; } -void elf_close(struct elf *elf) +int elf_close(struct elf *elf) { if (elf->elf) elf_end(elf->elf); @@ -1374,8 +1792,12 @@ void elf_close(struct elf *elf) if (elf->fd > 0) close(elf->fd); + if (elf->tmp_name && rename(elf->tmp_name, elf->name)) + return -1; + /* * NOTE: All remaining allocations are leaked on purpose. Objtool is * about to exit anyway. */ + return 0; } diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index be33c7b43180..8866158975fc 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -71,7 +71,7 @@ struct stack_op { struct instruction; -int arch_ftrace_match(char *name); +int arch_ftrace_match(const char *name); void arch_initial_func_cfi_state(struct cfi_init_state *state); @@ -83,7 +83,8 @@ bool arch_callee_saved_reg(unsigned char reg); unsigned long arch_jump_destination(struct instruction *insn); -unsigned long arch_dest_reloc_offset(int addend); +s64 arch_insn_adjusted_addend(struct instruction *insn, struct reloc *reloc); +u64 arch_adjusted_addend(struct reloc *reloc); const char *arch_nop_insn(int len); const char *arch_ret_insn(int len); @@ -102,4 +103,15 @@ bool arch_absolute_reloc(struct elf *elf, struct reloc *reloc); unsigned int arch_reloc_size(struct reloc *reloc); unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); +extern const char *arch_reg_name[CFI_NUM_REGS]; + +#ifdef DISAS + +#include <bfd.h> +#include <dis-asm.h> + +int arch_disas_info_init(struct disassemble_info *dinfo); + +#endif /* DISAS */ + #endif /* _ARCH_H */ diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index ab22673862e1..b9e229ed4dc0 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -9,12 +9,15 @@ struct opts { /* actions: */ + bool cfi; + bool checksum; bool dump_orc; bool hack_jump_label; bool hack_noinstr; bool hack_skylake; bool ibt; bool mcount; + bool noabs; bool noinstr; bool orc; bool retpoline; @@ -25,11 +28,12 @@ struct opts { bool static_call; bool uaccess; int prefix; - bool cfi; - bool noabs; + const char *disas; /* options: */ bool backtrace; + bool backup; + const char *debug_checksum; bool dryrun; bool link; bool mnop; @@ -38,8 +42,10 @@ struct opts { const char *output; bool sec_address; bool stats; + const char *trace; bool verbose; bool werror; + bool wide; }; extern struct opts opts; @@ -48,6 +54,8 @@ int cmd_parse_options(int argc, const char **argv, const char * const usage[]); int objtool_run(int argc, const char **argv); -void print_args(void); +int make_backup(void); + +int cmd_klp(int argc, const char **argv); #endif /* _BUILTIN_H */ diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 00fb745e7233..2e1346ad5e92 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -36,6 +36,19 @@ struct alt_group { struct cfi_state **cfi; bool ignore; + unsigned int feature; +}; + +enum alternative_type { + ALT_TYPE_INSTRUCTIONS, + ALT_TYPE_JUMP_TABLE, + ALT_TYPE_EX_TABLE, +}; + +struct alternative { + struct alternative *next; + struct instruction *insn; + enum alternative_type type; }; #define INSN_CHUNK_BITS 8 @@ -64,8 +77,11 @@ struct instruction { noendbr : 1, unret : 1, visited : 4, - no_reloc : 1; - /* 10 bit hole */ + no_reloc : 1, + hole : 1, + fake : 1, + trace : 1; + /* 9 bit hole */ struct alt_group *alt_group; struct instruction *jump_dest; @@ -115,6 +131,15 @@ static inline bool is_jump(struct instruction *insn) return is_static_jump(insn) || is_dynamic_jump(insn); } +static inline struct symbol *insn_call_dest(struct instruction *insn) +{ + if (insn->type == INSN_JUMP_DYNAMIC || + insn->type == INSN_CALL_DYNAMIC) + return NULL; + + return insn->_call_dest; +} + struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset); @@ -125,4 +150,14 @@ struct instruction *next_insn_same_sec(struct objtool_file *file, struct instruc insn && insn->sec == _sec; \ insn = next_insn_same_sec(file, insn)) +#define sym_for_each_insn(file, sym, insn) \ + for (insn = find_insn(file, sym->sec, sym->offset); \ + insn && insn->offset < sym->offset + sym->len; \ + insn = next_insn_same_sec(file, insn)) + +const char *objtool_disas_insn(struct instruction *insn); + +extern size_t sym_name_max_len; +extern struct disas_context *objtool_disas_ctx; + #endif /* _CHECK_H */ diff --git a/tools/objtool/include/objtool/checksum.h b/tools/objtool/include/objtool/checksum.h new file mode 100644 index 000000000000..7fe21608722a --- /dev/null +++ b/tools/objtool/include/objtool/checksum.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_CHECKSUM_H +#define _OBJTOOL_CHECKSUM_H + +#include <objtool/elf.h> + +#ifdef BUILD_KLP + +static inline void checksum_init(struct symbol *func) +{ + if (func && !func->csum.state) { + func->csum.state = XXH3_createState(); + XXH3_64bits_reset(func->csum.state); + } +} + +static inline void checksum_update(struct symbol *func, + struct instruction *insn, + const void *data, size_t size) +{ + XXH3_64bits_update(func->csum.state, data, size); + dbg_checksum(func, insn, XXH3_64bits_digest(func->csum.state)); +} + +static inline void checksum_finish(struct symbol *func) +{ + if (func && func->csum.state) { + func->csum.checksum = XXH3_64bits_digest(func->csum.state); + func->csum.state = NULL; + } +} + +#else /* !BUILD_KLP */ + +static inline void checksum_init(struct symbol *func) {} +static inline void checksum_update(struct symbol *func, + struct instruction *insn, + const void *data, size_t size) {} +static inline void checksum_finish(struct symbol *func) {} + +#endif /* !BUILD_KLP */ + +#endif /* _OBJTOOL_CHECKSUM_H */ diff --git a/tools/objtool/include/objtool/checksum_types.h b/tools/objtool/include/objtool/checksum_types.h new file mode 100644 index 000000000000..507efdd8ab5b --- /dev/null +++ b/tools/objtool/include/objtool/checksum_types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _OBJTOOL_CHECKSUM_TYPES_H +#define _OBJTOOL_CHECKSUM_TYPES_H + +struct sym_checksum { + u64 addr; + u64 checksum; +}; + +#ifdef BUILD_KLP + +#include <xxhash.h> + +struct checksum { + XXH3_state_t *state; + XXH64_hash_t checksum; +}; + +#else /* !BUILD_KLP */ + +struct checksum {}; + +#endif /* !BUILD_KLP */ + +#endif /* _OBJTOOL_CHECKSUM_TYPES_H */ diff --git a/tools/objtool/include/objtool/disas.h b/tools/objtool/include/objtool/disas.h new file mode 100644 index 000000000000..e8f395eff159 --- /dev/null +++ b/tools/objtool/include/objtool/disas.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. + */ + +#ifndef _DISAS_H +#define _DISAS_H + +struct alternative; +struct disas_context; +struct disassemble_info; + +#ifdef DISAS + +struct disas_context *disas_context_create(struct objtool_file *file); +void disas_context_destroy(struct disas_context *dctx); +void disas_warned_funcs(struct disas_context *dctx); +void disas_funcs(struct disas_context *dctx); +int disas_info_init(struct disassemble_info *dinfo, + int arch, int mach32, int mach64, + const char *options); +size_t disas_insn(struct disas_context *dctx, struct instruction *insn); +char *disas_result(struct disas_context *dctx); +void disas_print_info(FILE *stream, struct instruction *insn, int depth, + const char *format, ...); +void disas_print_insn(FILE *stream, struct disas_context *dctx, + struct instruction *insn, int depth, + const char *format, ...); +char *disas_alt_name(struct alternative *alt); +const char *disas_alt_type_name(struct instruction *insn); + +#else /* DISAS */ + +#include <objtool/warn.h> + +static inline struct disas_context *disas_context_create(struct objtool_file *file) +{ + WARN("Rebuild with libopcodes for disassembly support"); + return NULL; +} + +static inline void disas_context_destroy(struct disas_context *dctx) {} +static inline void disas_warned_funcs(struct disas_context *dctx) {} +static inline void disas_funcs(struct disas_context *dctx) {} + +static inline int disas_info_init(struct disassemble_info *dinfo, + int arch, int mach32, int mach64, + const char *options) +{ + return -1; +} + +static inline size_t disas_insn(struct disas_context *dctx, + struct instruction *insn) +{ + return -1; +} + +static inline char *disas_result(struct disas_context *dctx) +{ + return NULL; +} + +static inline void disas_print_info(FILE *stream, struct instruction *insn, + int depth, const char *format, ...) {} +static inline void disas_print_insn(FILE *stream, struct disas_context *dctx, + struct instruction *insn, int depth, + const char *format, ...) {} +static inline char *disas_alt_name(struct alternative *alt) +{ + return NULL; +} + +static inline const char *disas_alt_type_name(struct instruction *insn) +{ + return NULL; +} + +#endif /* DISAS */ + +#endif /* _DISAS_H */ diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index df8434d3b744..e12c516bd320 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -8,12 +8,21 @@ #include <stdio.h> #include <gelf.h> +#include <linux/string.h> #include <linux/list.h> #include <linux/hashtable.h> #include <linux/rbtree.h> #include <linux/jhash.h> + +#include <objtool/endianness.h> +#include <objtool/checksum_types.h> #include <arch/elf.h> +#define SEC_NAME_LEN 1024 +#define SYM_NAME_LEN 512 + +#define bswap_if_needed(elf, val) __bswap_if_needed(&elf->ehdr, val) + #ifdef LIBELF_USE_DEPRECATED # define elf_getshdrnum elf_getshnum # define elf_getshdrstrndx elf_getshstrndx @@ -40,24 +49,27 @@ struct section { struct section *base, *rsec; struct symbol *sym; Elf_Data *data; - char *name; + const char *name; int idx; bool _changed, text, rodata, noinstr, init, truncate; struct reloc *relocs; + unsigned long nr_alloc_relocs; + struct section *twin; }; struct symbol { struct list_head list; + struct list_head global_list; struct rb_node node; struct elf_hash_node hash; struct elf_hash_node name_hash; GElf_Sym sym; struct section *sec; - char *name; + const char *name, *demangled_name; unsigned int idx, len; unsigned long offset; unsigned long __subtree_last; - struct symbol *pfunc, *cfunc, *alias; + struct symbol *pfunc, *cfunc, *alias, *file; unsigned char bind, type; u8 uaccess_safe : 1; u8 static_call_tramp : 1; @@ -71,9 +83,17 @@ struct symbol { u8 frame_pointer : 1; u8 ignore : 1; u8 nocfi : 1; + u8 cold : 1; + u8 prefix : 1; + u8 debug_checksum : 1; + u8 changed : 1; + u8 included : 1; + u8 klp : 1; struct list_head pv_target; struct reloc *relocs; struct section *group_sec; + struct checksum csum; + struct symbol *twin, *clone; }; struct reloc { @@ -88,9 +108,10 @@ struct elf { GElf_Ehdr ehdr; int fd; bool changed; - char *name; + const char *name, *tmp_name; unsigned int num_files; struct list_head sections; + struct list_head symbols; unsigned long num_relocs; int symbol_bits; @@ -110,14 +131,37 @@ struct elf { }; struct elf *elf_open_read(const char *name, int flags); +struct elf *elf_create_file(GElf_Ehdr *ehdr, const char *name); struct section *elf_create_section(struct elf *elf, const char *name, - size_t entsize, unsigned int nr); + size_t size, size_t entsize, + unsigned int type, unsigned int align, + unsigned int flags); struct section *elf_create_section_pair(struct elf *elf, const char *name, size_t entsize, unsigned int nr, unsigned int reloc_nr); -struct symbol *elf_create_prefix_symbol(struct elf *elf, struct symbol *orig, long size); +struct section *elf_create_rela_section(struct elf *elf, struct section *sec, + unsigned int reloc_nr); + +struct symbol *elf_create_symbol(struct elf *elf, const char *name, + struct section *sec, unsigned int bind, + unsigned int type, unsigned long offset, + size_t size); +struct symbol *elf_create_section_symbol(struct elf *elf, struct section *sec); + +void *elf_add_data(struct elf *elf, struct section *sec, const void *data, + size_t size); + +unsigned int elf_add_string(struct elf *elf, struct section *strtab, const char *str); + +struct reloc *elf_create_reloc(struct elf *elf, struct section *sec, + unsigned long offset, struct symbol *sym, + s64 addend, unsigned int type); + +struct reloc *elf_init_reloc(struct elf *elf, struct section *rsec, + unsigned int reloc_idx, unsigned long offset, + struct symbol *sym, s64 addend, unsigned int type); struct reloc *elf_init_reloc_text_sym(struct elf *elf, struct section *sec, unsigned long offset, @@ -131,16 +175,17 @@ struct reloc *elf_init_reloc_data_sym(struct elf *elf, struct section *sec, struct symbol *sym, s64 addend); -int elf_write_insn(struct elf *elf, struct section *sec, - unsigned long offset, unsigned int len, - const char *insn); +int elf_write_insn(struct elf *elf, struct section *sec, unsigned long offset, + unsigned int len, const char *insn); + int elf_write(struct elf *elf); -void elf_close(struct elf *elf); +int elf_close(struct elf *elf); struct section *find_section_by_name(const struct elf *elf, const char *name); struct symbol *find_func_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_name(const struct elf *elf, const char *name); +struct symbol *find_global_symbol_by_name(const struct elf *elf, const char *name); struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset); int find_symbol_hole_containing(const struct section *sec, unsigned long offset); struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset); @@ -178,11 +223,76 @@ static inline unsigned int elf_text_rela_type(struct elf *elf) return elf_addr_size(elf) == 4 ? R_TEXT32 : R_TEXT64; } +static inline bool is_undef_sym(struct symbol *sym) +{ + return !sym->sec->idx; +} + +static inline bool is_null_sym(struct symbol *sym) +{ + return !sym->idx; +} + +static inline bool is_sec_sym(struct symbol *sym) +{ + return sym->type == STT_SECTION; +} + +static inline bool is_object_sym(struct symbol *sym) +{ + return sym->type == STT_OBJECT; +} + +static inline bool is_func_sym(struct symbol *sym) +{ + return sym->type == STT_FUNC; +} + +static inline bool is_file_sym(struct symbol *sym) +{ + return sym->type == STT_FILE; +} + +static inline bool is_notype_sym(struct symbol *sym) +{ + return sym->type == STT_NOTYPE; +} + +static inline bool is_global_sym(struct symbol *sym) +{ + return sym->bind == STB_GLOBAL; +} + +static inline bool is_weak_sym(struct symbol *sym) +{ + return sym->bind == STB_WEAK; +} + +static inline bool is_local_sym(struct symbol *sym) +{ + return sym->bind == STB_LOCAL; +} + +static inline bool is_prefix_func(struct symbol *sym) +{ + return sym->prefix; +} + static inline bool is_reloc_sec(struct section *sec) { return sec->sh.sh_type == SHT_RELA || sec->sh.sh_type == SHT_REL; } +static inline bool is_string_sec(struct section *sec) +{ + return sec->sh.sh_flags & SHF_STRINGS; +} + +static inline bool is_text_sec(struct section *sec) +{ + return sec->sh.sh_flags & SHF_EXECINSTR; +} + static inline bool sec_changed(struct section *sec) { return sec->_changed; @@ -223,6 +333,11 @@ static inline bool is_32bit_reloc(struct reloc *reloc) return reloc->sec->sh.sh_entsize < 16; } +static inline unsigned long sec_size(struct section *sec) +{ + return sec->sh.sh_size; +} + #define __get_reloc_field(reloc, field) \ ({ \ is_32bit_reloc(reloc) ? \ @@ -300,6 +415,15 @@ static inline void set_reloc_type(struct elf *elf, struct reloc *reloc, unsigned mark_sec_changed(elf, reloc->sec, true); } +static inline unsigned int annotype(struct elf *elf, struct section *sec, + struct reloc *reloc) +{ + unsigned int type; + + type = *(u32 *)(sec->data->d_buf + (reloc_idx(reloc) * 8) + 4); + return bswap_if_needed(elf, type); +} + #define RELOC_JUMP_TABLE_BIT 1UL /* Does reloc mark the beginning of a jump table? */ @@ -325,28 +449,54 @@ static inline void set_sym_next_reloc(struct reloc *reloc, struct reloc *next) reloc->_sym_next_reloc = (unsigned long)next | bit; } -#define for_each_sec(file, sec) \ - list_for_each_entry(sec, &file->elf->sections, list) +#define for_each_sec(elf, sec) \ + list_for_each_entry(sec, &elf->sections, list) #define sec_for_each_sym(sec, sym) \ list_for_each_entry(sym, &sec->symbol_list, list) -#define for_each_sym(file, sym) \ - for (struct section *__sec, *__fake = (struct section *)1; \ - __fake; __fake = NULL) \ - for_each_sec(file, __sec) \ - sec_for_each_sym(__sec, sym) +#define sec_prev_sym(sym) \ + sym->sec && sym->list.prev != &sym->sec->symbol_list ? \ + list_prev_entry(sym, list) : NULL + +#define for_each_sym(elf, sym) \ + list_for_each_entry(sym, &elf->symbols, global_list) + +#define for_each_sym_continue(elf, sym) \ + list_for_each_entry_continue(sym, &elf->symbols, global_list) + +#define rsec_next_reloc(rsec, reloc) \ + reloc_idx(reloc) < sec_num_entries(rsec) - 1 ? reloc + 1 : NULL #define for_each_reloc(rsec, reloc) \ - for (int __i = 0, __fake = 1; __fake; __fake = 0) \ - for (reloc = rsec->relocs; \ - __i < sec_num_entries(rsec); \ - __i++, reloc++) + for (reloc = rsec->relocs; reloc; reloc = rsec_next_reloc(rsec, reloc)) #define for_each_reloc_from(rsec, reloc) \ - for (int __i = reloc_idx(reloc); \ - __i < sec_num_entries(rsec); \ - __i++, reloc++) + for (; reloc; reloc = rsec_next_reloc(rsec, reloc)) + +#define for_each_reloc_continue(rsec, reloc) \ + for (reloc = rsec_next_reloc(rsec, reloc); reloc; \ + reloc = rsec_next_reloc(rsec, reloc)) + +#define sym_for_each_reloc(elf, sym, reloc) \ + for (reloc = find_reloc_by_dest_range(elf, sym->sec, \ + sym->offset, sym->len); \ + reloc && reloc_offset(reloc) < sym->offset + sym->len; \ + reloc = rsec_next_reloc(sym->sec->rsec, reloc)) + +static inline struct symbol *get_func_prefix(struct symbol *func) +{ + struct symbol *prev; + + if (!is_func_sym(func)) + return NULL; + + prev = sec_prev_sym(func); + if (prev && is_prefix_func(prev)) + return prev; + + return NULL; +} #define OFFSET_STRIDE_BITS 4 #define OFFSET_STRIDE (1UL << OFFSET_STRIDE_BITS) diff --git a/tools/objtool/include/objtool/endianness.h b/tools/objtool/include/objtool/endianness.h index 4d2aa9b0fe2f..aebcd2338668 100644 --- a/tools/objtool/include/objtool/endianness.h +++ b/tools/objtool/include/objtool/endianness.h @@ -4,7 +4,6 @@ #include <linux/kernel.h> #include <endian.h> -#include <objtool/elf.h> /* * Does a byte swap if target file endianness doesn't match the host, i.e. cross @@ -12,16 +11,16 @@ * To be used for multi-byte values conversion, which are read from / about * to be written to a target native endianness ELF file. */ -static inline bool need_bswap(struct elf *elf) +static inline bool need_bswap(GElf_Ehdr *ehdr) { return (__BYTE_ORDER == __LITTLE_ENDIAN) ^ - (elf->ehdr.e_ident[EI_DATA] == ELFDATA2LSB); + (ehdr->e_ident[EI_DATA] == ELFDATA2LSB); } -#define bswap_if_needed(elf, val) \ +#define __bswap_if_needed(ehdr, val) \ ({ \ __typeof__(val) __ret; \ - bool __need_bswap = need_bswap(elf); \ + bool __need_bswap = need_bswap(ehdr); \ switch (sizeof(val)) { \ case 8: \ __ret = __need_bswap ? bswap_64(val) : (val); break; \ diff --git a/tools/objtool/include/objtool/klp.h b/tools/objtool/include/objtool/klp.h new file mode 100644 index 000000000000..ad830a7ce55b --- /dev/null +++ b/tools/objtool/include/objtool/klp.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_KLP_H +#define _OBJTOOL_KLP_H + +#define SHF_RELA_LIVEPATCH 0x00100000 +#define SHN_LIVEPATCH 0xff20 + +/* + * __klp_objects and __klp_funcs are created by klp diff and used by the patch + * module init code to build the klp_patch, klp_object and klp_func structs + * needed by the livepatch API. + */ +#define KLP_OBJECTS_SEC "__klp_objects" +#define KLP_FUNCS_SEC "__klp_funcs" + +/* + * __klp_relocs is an intermediate section which are created by klp diff and + * converted into KLP symbols/relas by "objtool klp post-link". This is needed + * to work around the linker, which doesn't preserve SHN_LIVEPATCH or + * SHF_RELA_LIVEPATCH, nor does it support having two RELA sections for a + * single PROGBITS section. + */ +#define KLP_RELOCS_SEC "__klp_relocs" +#define KLP_STRINGS_SEC ".rodata.klp.str1.1" + +struct klp_reloc { + void *offset; + void *sym; + u32 type; +}; + +int cmd_klp_diff(int argc, const char **argv); +int cmd_klp_post_link(int argc, const char **argv); + +#endif /* _OBJTOOL_KLP_H */ diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h index c0dc86a78ff6..f7051bbe0bcb 100644 --- a/tools/objtool/include/objtool/objtool.h +++ b/tools/objtool/include/objtool/objtool.h @@ -28,7 +28,7 @@ struct objtool_file { struct list_head mcount_loc_list; struct list_head endbr_list; struct list_head call_list; - bool ignore_unreachables, hints, rodata; + bool ignore_unreachables, hints, rodata, klp; unsigned int nr_endbr; unsigned int nr_endbr_int; @@ -39,6 +39,8 @@ struct objtool_file { struct pv_state *pv_ops; }; +char *top_level_dir(const char *file); + struct objtool_file *objtool_open_read(const char *_objname); int objtool_pv_add(struct objtool_file *file, int idx, struct symbol *func); diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h index 72d09c0adf1a..121c3761899c 100644 --- a/tools/objtool/include/objtool/special.h +++ b/tools/objtool/include/objtool/special.h @@ -25,7 +25,7 @@ struct special_alt { struct section *new_sec; unsigned long new_off; - unsigned int orig_len, new_len; /* group only */ + unsigned int orig_len, new_len, feature; /* group only */ }; int special_get_alts(struct elf *elf, struct list_head *alts); @@ -38,4 +38,6 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, struct reloc *arch_find_switch_table(struct objtool_file *file, struct instruction *insn, unsigned long *table_size); +const char *arch_cpu_feature_name(int feature_number); + #endif /* _SPECIAL_H */ diff --git a/tools/objtool/include/objtool/trace.h b/tools/objtool/include/objtool/trace.h new file mode 100644 index 000000000000..70b574366797 --- /dev/null +++ b/tools/objtool/include/objtool/trace.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2025, Oracle and/or its affiliates. + */ + +#ifndef _TRACE_H +#define _TRACE_H + +#include <objtool/check.h> +#include <objtool/disas.h> + +#ifdef DISAS + +extern bool trace; +extern int trace_depth; + +#define TRACE(fmt, ...) \ +({ if (trace) \ + fprintf(stderr, fmt, ##__VA_ARGS__); \ +}) + +/* + * Print the instruction address and a message. The instruction + * itself is not printed. + */ +#define TRACE_ADDR(insn, fmt, ...) \ +({ \ + if (trace) { \ + disas_print_info(stderr, insn, trace_depth - 1, \ + fmt "\n", ##__VA_ARGS__); \ + } \ +}) + +/* + * Print the instruction address, the instruction and a message. + */ +#define TRACE_INSN(insn, fmt, ...) \ +({ \ + if (trace) { \ + disas_print_insn(stderr, objtool_disas_ctx, \ + insn, trace_depth - 1, \ + fmt, ##__VA_ARGS__); \ + fprintf(stderr, "\n"); \ + insn->trace = 1; \ + } \ +}) + +#define TRACE_INSN_STATE(insn, sprev, snext) \ +({ \ + if (trace) \ + trace_insn_state(insn, sprev, snext); \ +}) + +#define TRACE_ALT_FMT(pfx, fmt) pfx "<%s.%lx> " fmt +#define TRACE_ALT_ARG(insn) disas_alt_type_name(insn), (insn)->offset + +#define TRACE_ALT(insn, fmt, ...) \ + TRACE_INSN(insn, TRACE_ALT_FMT("", fmt), \ + TRACE_ALT_ARG(insn), ##__VA_ARGS__) + +#define TRACE_ALT_INFO(insn, pfx, fmt, ...) \ + TRACE_ADDR(insn, TRACE_ALT_FMT(pfx, fmt), \ + TRACE_ALT_ARG(insn), ##__VA_ARGS__) + +#define TRACE_ALT_INFO_NOADDR(insn, pfx, fmt, ...) \ + TRACE_ADDR(NULL, TRACE_ALT_FMT(pfx, fmt), \ + TRACE_ALT_ARG(insn), ##__VA_ARGS__) + +#define TRACE_ALT_BEGIN(insn, alt, alt_name) \ +({ \ + if (trace) { \ + alt_name = disas_alt_name(alt); \ + trace_alt_begin(insn, alt, alt_name); \ + } \ +}) + +#define TRACE_ALT_END(insn, alt, alt_name) \ +({ \ + if (trace) { \ + trace_alt_end(insn, alt, alt_name); \ + free(alt_name); \ + } \ +}) + +static inline void trace_enable(void) +{ + trace = true; + trace_depth = 0; +} + +static inline void trace_disable(void) +{ + trace = false; +} + +static inline void trace_depth_inc(void) +{ + if (trace) + trace_depth++; +} + +static inline void trace_depth_dec(void) +{ + if (trace) + trace_depth--; +} + +void trace_insn_state(struct instruction *insn, struct insn_state *sprev, + struct insn_state *snext); +void trace_alt_begin(struct instruction *orig_insn, struct alternative *alt, + char *alt_name); +void trace_alt_end(struct instruction *orig_insn, struct alternative *alt, + char *alt_name); + +#else /* DISAS */ + +#define TRACE(fmt, ...) ({}) +#define TRACE_ADDR(insn, fmt, ...) ({}) +#define TRACE_INSN(insn, fmt, ...) ({}) +#define TRACE_INSN_STATE(insn, sprev, snext) ({}) +#define TRACE_ALT(insn, fmt, ...) ({}) +#define TRACE_ALT_INFO(insn, fmt, ...) ({}) +#define TRACE_ALT_INFO_NOADDR(insn, fmt, ...) ({}) +#define TRACE_ALT_BEGIN(insn, alt, alt_name) ({}) +#define TRACE_ALT_END(insn, alt, alt_name) ({}) + + +static inline void trace_enable(void) {} +static inline void trace_disable(void) {} +static inline void trace_depth_inc(void) {} +static inline void trace_depth_dec(void) {} +static inline void trace_alt_begin(struct instruction *orig_insn, + struct alternative *alt, + char *alt_name) {}; +static inline void trace_alt_end(struct instruction *orig_insn, + struct alternative *alt, + char *alt_name) {}; + +#endif + +#endif /* _TRACE_H */ diff --git a/tools/objtool/include/objtool/util.h b/tools/objtool/include/objtool/util.h new file mode 100644 index 000000000000..a0180b312f73 --- /dev/null +++ b/tools/objtool/include/objtool/util.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _UTIL_H +#define _UTIL_H + +#include <objtool/warn.h> + +#define snprintf_check(str, size, format, args...) \ +({ \ + int __ret = snprintf(str, size, format, args); \ + if (__ret < 0) \ + ERROR_GLIBC("snprintf"); \ + else if (__ret >= size) \ + ERROR("snprintf() failed for '" format "'", args); \ + else \ + __ret = 0; \ + __ret; \ +}) + +#endif /* _UTIL_H */ diff --git a/tools/objtool/include/objtool/warn.h b/tools/objtool/include/objtool/warn.h index cb8fe846d9dd..25ff7942b4d5 100644 --- a/tools/objtool/include/objtool/warn.h +++ b/tools/objtool/include/objtool/warn.h @@ -77,9 +77,11 @@ static inline char *offstr(struct section *sec, unsigned long offset) #define WARN_INSN(insn, format, ...) \ ({ \ struct instruction *_insn = (insn); \ - if (!_insn->sym || !_insn->sym->warned) \ + if (!_insn->sym || !_insn->sym->warned) { \ WARN_FUNC(_insn->sec, _insn->offset, format, \ ##__VA_ARGS__); \ + BT_INSN(_insn, ""); \ + } \ if (_insn->sym) \ _insn->sym->warned = 1; \ }) @@ -87,10 +89,15 @@ static inline char *offstr(struct section *sec, unsigned long offset) #define BT_INSN(insn, format, ...) \ ({ \ if (opts.verbose || opts.backtrace) { \ - struct instruction *_insn = (insn); \ - char *_str = offstr(_insn->sec, _insn->offset); \ - WARN(" %s: " format, _str, ##__VA_ARGS__); \ - free(_str); \ + struct instruction *__insn = (insn); \ + char *_str = offstr(__insn->sec, __insn->offset); \ + const char *_istr = objtool_disas_insn(__insn); \ + int _len; \ + _len = snprintf(NULL, 0, " %s: " format, _str, ##__VA_ARGS__); \ + _len = (_len < 50) ? 50 - _len : 0; \ + WARN(" %s: " format " %*s%s", _str, ##__VA_ARGS__, _len, "", _istr); \ + free(_str); \ + __insn->trace = 1; \ } \ }) @@ -102,4 +109,53 @@ static inline char *offstr(struct section *sec, unsigned long offset) #define ERROR_FUNC(sec, offset, format, ...) __WARN_FUNC(ERROR_STR, sec, offset, format, ##__VA_ARGS__) #define ERROR_INSN(insn, format, ...) WARN_FUNC(insn->sec, insn->offset, format, ##__VA_ARGS__) +extern bool debug; +extern int indent; + +static inline void unindent(int *unused) { indent--; } + +/* + * Clang prior to 17 is being silly and considers many __cleanup() variables + * as unused (because they are, their sole purpose is to go out of scope). + * + * https://github.com/llvm/llvm-project/commit/877210faa447f4cc7db87812f8ed80e398fedd61 + */ +#undef __cleanup +#define __cleanup(func) __maybe_unused __attribute__((__cleanup__(func))) + +#define __dbg(format, ...) \ + fprintf(stderr, \ + "DEBUG: %s%s" format "\n", \ + objname ?: "", \ + objname ? ": " : "", \ + ##__VA_ARGS__) + +#define dbg(args...) \ +({ \ + if (unlikely(debug)) \ + __dbg(args); \ +}) + +#define __dbg_indent(format, ...) \ +({ \ + if (unlikely(debug)) \ + __dbg("%*s" format, indent * 8, "", ##__VA_ARGS__); \ +}) + +#define dbg_indent(args...) \ + int __cleanup(unindent) __dummy_##__COUNTER__; \ + __dbg_indent(args); \ + indent++ + +#define dbg_checksum(func, insn, checksum) \ +({ \ + if (unlikely(insn->sym && insn->sym->pfunc && \ + insn->sym->pfunc->debug_checksum)) { \ + char *insn_off = offstr(insn->sec, insn->offset); \ + __dbg("checksum: %s %s %016lx", \ + func->name, insn_off, checksum); \ + free(insn_off); \ + } \ +}) + #endif /* _WARN_H */ diff --git a/tools/objtool/klp-diff.c b/tools/objtool/klp-diff.c new file mode 100644 index 000000000000..4d1f9e9977eb --- /dev/null +++ b/tools/objtool/klp-diff.c @@ -0,0 +1,1723 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#define _GNU_SOURCE /* memmem() */ +#include <subcmd/parse-options.h> +#include <stdlib.h> +#include <string.h> +#include <libgen.h> +#include <stdio.h> +#include <ctype.h> + +#include <objtool/objtool.h> +#include <objtool/warn.h> +#include <objtool/arch.h> +#include <objtool/klp.h> +#include <objtool/util.h> +#include <arch/special.h> + +#include <linux/objtool_types.h> +#include <linux/livepatch_external.h> +#include <linux/stringify.h> +#include <linux/string.h> +#include <linux/jhash.h> + +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) + +struct elfs { + struct elf *orig, *patched, *out; + const char *modname; +}; + +struct export { + struct hlist_node hash; + char *mod, *sym; +}; + +static const char * const klp_diff_usage[] = { + "objtool klp diff [<options>] <in1.o> <in2.o> <out.o>", + NULL, +}; + +static const struct option klp_diff_options[] = { + OPT_GROUP("Options:"), + OPT_BOOLEAN('d', "debug", &debug, "enable debug output"), + OPT_END(), +}; + +static DEFINE_HASHTABLE(exports, 15); + +static inline u32 str_hash(const char *str) +{ + return jhash(str, strlen(str), 0); +} + +static char *escape_str(const char *orig) +{ + size_t len = 0; + const char *a; + char *b, *new; + + for (a = orig; *a; a++) { + switch (*a) { + case '\001': len += 5; break; + case '\n': + case '\t': len += 2; break; + default: len++; + } + } + + new = malloc(len + 1); + if (!new) + return NULL; + + for (a = orig, b = new; *a; a++) { + switch (*a) { + case '\001': memcpy(b, "<SOH>", 5); b += 5; break; + case '\n': *b++ = '\\'; *b++ = 'n'; break; + case '\t': *b++ = '\\'; *b++ = 't'; break; + default: *b++ = *a; + } + } + + *b = '\0'; + return new; +} + +static int read_exports(void) +{ + const char *symvers = "Module.symvers"; + char line[1024], *path = NULL; + unsigned int line_num = 1; + FILE *file; + + file = fopen(symvers, "r"); + if (!file) { + path = top_level_dir(symvers); + if (!path) { + ERROR("can't open '%s', \"objtool diff\" should be run from the kernel tree", symvers); + return -1; + } + + file = fopen(path, "r"); + if (!file) { + ERROR_GLIBC("fopen"); + return -1; + } + } + + while (fgets(line, 1024, file)) { + char *sym, *mod, *type; + struct export *export; + + sym = strchr(line, '\t'); + if (!sym) { + ERROR("malformed Module.symvers (sym) at line %d", line_num); + return -1; + } + + *sym++ = '\0'; + + mod = strchr(sym, '\t'); + if (!mod) { + ERROR("malformed Module.symvers (mod) at line %d", line_num); + return -1; + } + + *mod++ = '\0'; + + type = strchr(mod, '\t'); + if (!type) { + ERROR("malformed Module.symvers (type) at line %d", line_num); + return -1; + } + + *type++ = '\0'; + + if (*sym == '\0' || *mod == '\0') { + ERROR("malformed Module.symvers at line %d", line_num); + return -1; + } + + export = calloc(1, sizeof(*export)); + if (!export) { + ERROR_GLIBC("calloc"); + return -1; + } + + export->mod = strdup(mod); + if (!export->mod) { + ERROR_GLIBC("strdup"); + return -1; + } + + export->sym = strdup(sym); + if (!export->sym) { + ERROR_GLIBC("strdup"); + return -1; + } + + hash_add(exports, &export->hash, str_hash(sym)); + } + + free(path); + fclose(file); + + return 0; +} + +static int read_sym_checksums(struct elf *elf) +{ + struct section *sec; + + sec = find_section_by_name(elf, ".discard.sym_checksum"); + if (!sec) { + ERROR("'%s' missing .discard.sym_checksum section, file not processed by 'objtool --checksum'?", + elf->name); + return -1; + } + + if (!sec->rsec) { + ERROR("missing reloc section for .discard.sym_checksum"); + return -1; + } + + if (sec_size(sec) % sizeof(struct sym_checksum)) { + ERROR("struct sym_checksum size mismatch"); + return -1; + } + + for (int i = 0; i < sec_size(sec) / sizeof(struct sym_checksum); i++) { + struct sym_checksum *sym_checksum; + struct reloc *reloc; + struct symbol *sym; + + sym_checksum = (struct sym_checksum *)sec->data->d_buf + i; + + reloc = find_reloc_by_dest(elf, sec, i * sizeof(*sym_checksum)); + if (!reloc) { + ERROR("can't find reloc for sym_checksum[%d]", i); + return -1; + } + + sym = reloc->sym; + + if (is_sec_sym(sym)) { + ERROR("not sure how to handle section %s", sym->name); + return -1; + } + + if (is_func_sym(sym)) + sym->csum.checksum = sym_checksum->checksum; + } + + return 0; +} + +static struct symbol *first_file_symbol(struct elf *elf) +{ + struct symbol *sym; + + for_each_sym(elf, sym) { + if (is_file_sym(sym)) + return sym; + } + + return NULL; +} + +static struct symbol *next_file_symbol(struct elf *elf, struct symbol *sym) +{ + for_each_sym_continue(elf, sym) { + if (is_file_sym(sym)) + return sym; + } + + return NULL; +} + +/* + * Certain static local variables should never be correlated. They will be + * used in place rather than referencing the originals. + */ +static bool is_uncorrelated_static_local(struct symbol *sym) +{ + static const char * const vars[] = { + "__already_done.", + "__func__.", + "__key.", + "__warned.", + "_entry.", + "_entry_ptr.", + "_rs.", + "descriptor.", + "CSWTCH.", + }; + + if (!is_object_sym(sym) || !is_local_sym(sym)) + return false; + + if (!strcmp(sym->sec->name, ".data.once")) + return true; + + for (int i = 0; i < ARRAY_SIZE(vars); i++) { + if (strstarts(sym->name, vars[i])) + return true; + } + + return false; +} + +/* + * Clang emits several useless .Ltmp_* code labels. + */ +static bool is_clang_tmp_label(struct symbol *sym) +{ + return sym->type == STT_NOTYPE && + is_text_sec(sym->sec) && + strstarts(sym->name, ".Ltmp") && + isdigit(sym->name[5]); +} + +static bool is_special_section(struct section *sec) +{ + static const char * const specials[] = { + ".altinstructions", + ".smp_locks", + "__bug_table", + "__ex_table", + "__jump_table", + "__mcount_loc", + + /* + * Extract .static_call_sites here to inherit non-module + * preferential treatment. The later static call processing + * during klp module build will be skipped when it sees this + * section already exists. + */ + ".static_call_sites", + }; + + static const char * const non_special_discards[] = { + ".discard.addressable", + ".discard.sym_checksum", + }; + + if (is_text_sec(sec)) + return false; + + for (int i = 0; i < ARRAY_SIZE(specials); i++) { + if (!strcmp(sec->name, specials[i])) + return true; + } + + /* Most .discard data sections are special */ + for (int i = 0; i < ARRAY_SIZE(non_special_discards); i++) { + if (!strcmp(sec->name, non_special_discards[i])) + return false; + } + + return strstarts(sec->name, ".discard."); +} + +/* + * These sections are referenced by special sections but aren't considered + * special sections themselves. + */ +static bool is_special_section_aux(struct section *sec) +{ + static const char * const specials_aux[] = { + ".altinstr_replacement", + ".altinstr_aux", + }; + + for (int i = 0; i < ARRAY_SIZE(specials_aux); i++) { + if (!strcmp(sec->name, specials_aux[i])) + return true; + } + + return false; +} + +/* + * These symbols should never be correlated, so their local patched versions + * are used instead of linking to the originals. + */ +static bool dont_correlate(struct symbol *sym) +{ + return is_file_sym(sym) || + is_null_sym(sym) || + is_sec_sym(sym) || + is_prefix_func(sym) || + is_uncorrelated_static_local(sym) || + is_clang_tmp_label(sym) || + is_string_sec(sym->sec) || + is_special_section(sym->sec) || + is_special_section_aux(sym->sec) || + strstarts(sym->name, "__initcall__"); +} + +/* + * For each symbol in the original kernel, find its corresponding "twin" in the + * patched kernel. + */ +static int correlate_symbols(struct elfs *e) +{ + struct symbol *file1_sym, *file2_sym; + struct symbol *sym1, *sym2; + + /* Correlate locals */ + for (file1_sym = first_file_symbol(e->orig), + file2_sym = first_file_symbol(e->patched); ; + file1_sym = next_file_symbol(e->orig, file1_sym), + file2_sym = next_file_symbol(e->patched, file2_sym)) { + + if (!file1_sym && file2_sym) { + ERROR("FILE symbol mismatch: NULL != %s", file2_sym->name); + return -1; + } + + if (file1_sym && !file2_sym) { + ERROR("FILE symbol mismatch: %s != NULL", file1_sym->name); + return -1; + } + + if (!file1_sym) + break; + + if (strcmp(file1_sym->name, file2_sym->name)) { + ERROR("FILE symbol mismatch: %s != %s", file1_sym->name, file2_sym->name); + return -1; + } + + file1_sym->twin = file2_sym; + file2_sym->twin = file1_sym; + + sym1 = file1_sym; + + for_each_sym_continue(e->orig, sym1) { + if (is_file_sym(sym1) || !is_local_sym(sym1)) + break; + + if (dont_correlate(sym1)) + continue; + + sym2 = file2_sym; + for_each_sym_continue(e->patched, sym2) { + if (is_file_sym(sym2) || !is_local_sym(sym2)) + break; + + if (sym2->twin || dont_correlate(sym2)) + continue; + + if (strcmp(sym1->demangled_name, sym2->demangled_name)) + continue; + + sym1->twin = sym2; + sym2->twin = sym1; + break; + } + } + } + + /* Correlate globals */ + for_each_sym(e->orig, sym1) { + if (sym1->bind == STB_LOCAL) + continue; + + sym2 = find_global_symbol_by_name(e->patched, sym1->name); + + if (sym2 && !sym2->twin && !strcmp(sym1->name, sym2->name)) { + sym1->twin = sym2; + sym2->twin = sym1; + } + } + + for_each_sym(e->orig, sym1) { + if (sym1->twin || dont_correlate(sym1)) + continue; + WARN("no correlation: %s", sym1->name); + } + + return 0; +} + +/* "sympos" is used by livepatch to disambiguate duplicate symbol names */ +static unsigned long find_sympos(struct elf *elf, struct symbol *sym) +{ + bool vmlinux = str_ends_with(objname, "vmlinux.o"); + unsigned long sympos = 0, nr_matches = 0; + bool has_dup = false; + struct symbol *s; + + if (sym->bind != STB_LOCAL) + return 0; + + if (vmlinux && sym->type == STT_FUNC) { + /* + * HACK: Unfortunately, symbol ordering can differ between + * vmlinux.o and vmlinux due to the linker script emitting + * .text.unlikely* before .text*. Count .text.unlikely* first. + * + * TODO: Disambiguate symbols more reliably (checksums?) + */ + for_each_sym(elf, s) { + if (strstarts(s->sec->name, ".text.unlikely") && + !strcmp(s->name, sym->name)) { + nr_matches++; + if (s == sym) + sympos = nr_matches; + else + has_dup = true; + } + } + for_each_sym(elf, s) { + if (!strstarts(s->sec->name, ".text.unlikely") && + !strcmp(s->name, sym->name)) { + nr_matches++; + if (s == sym) + sympos = nr_matches; + else + has_dup = true; + } + } + } else { + for_each_sym(elf, s) { + if (!strcmp(s->name, sym->name)) { + nr_matches++; + if (s == sym) + sympos = nr_matches; + else + has_dup = true; + } + } + } + + if (!sympos) { + ERROR("can't find sympos for %s", sym->name); + return ULONG_MAX; + } + + return has_dup ? sympos : 0; +} + +static int clone_sym_relocs(struct elfs *e, struct symbol *patched_sym); + +static struct symbol *__clone_symbol(struct elf *elf, struct symbol *patched_sym, + bool data_too) +{ + struct section *out_sec = NULL; + unsigned long offset = 0; + struct symbol *out_sym; + + if (data_too && !is_undef_sym(patched_sym)) { + struct section *patched_sec = patched_sym->sec; + + out_sec = find_section_by_name(elf, patched_sec->name); + if (!out_sec) { + out_sec = elf_create_section(elf, patched_sec->name, 0, + patched_sec->sh.sh_entsize, + patched_sec->sh.sh_type, + patched_sec->sh.sh_addralign, + patched_sec->sh.sh_flags); + if (!out_sec) + return NULL; + } + + if (is_string_sec(patched_sym->sec)) { + out_sym = elf_create_section_symbol(elf, out_sec); + if (!out_sym) + return NULL; + + goto sym_created; + } + + if (!is_sec_sym(patched_sym)) + offset = sec_size(out_sec); + + if (patched_sym->len || is_sec_sym(patched_sym)) { + void *data = NULL; + size_t size; + + /* bss doesn't have data */ + if (patched_sym->sec->data->d_buf) + data = patched_sym->sec->data->d_buf + patched_sym->offset; + + if (is_sec_sym(patched_sym)) + size = sec_size(patched_sym->sec); + else + size = patched_sym->len; + + if (!elf_add_data(elf, out_sec, data, size)) + return NULL; + } + } + + out_sym = elf_create_symbol(elf, patched_sym->name, out_sec, + patched_sym->bind, patched_sym->type, + offset, patched_sym->len); + if (!out_sym) + return NULL; + +sym_created: + patched_sym->clone = out_sym; + out_sym->clone = patched_sym; + + return out_sym; +} + +static const char *sym_type(struct symbol *sym) +{ + switch (sym->type) { + case STT_NOTYPE: return "NOTYPE"; + case STT_OBJECT: return "OBJECT"; + case STT_FUNC: return "FUNC"; + case STT_SECTION: return "SECTION"; + case STT_FILE: return "FILE"; + default: return "UNKNOWN"; + } +} + +static const char *sym_bind(struct symbol *sym) +{ + switch (sym->bind) { + case STB_LOCAL: return "LOCAL"; + case STB_GLOBAL: return "GLOBAL"; + case STB_WEAK: return "WEAK"; + default: return "UNKNOWN"; + } +} + +/* + * Copy a symbol to the output object, optionally including its data and + * relocations. + */ +static struct symbol *clone_symbol(struct elfs *e, struct symbol *patched_sym, + bool data_too) +{ + struct symbol *pfx; + + if (patched_sym->clone) + return patched_sym->clone; + + dbg_indent("%s%s", patched_sym->name, data_too ? " [+DATA]" : ""); + + /* Make sure the prefix gets cloned first */ + if (is_func_sym(patched_sym) && data_too) { + pfx = get_func_prefix(patched_sym); + if (pfx) + clone_symbol(e, pfx, true); + } + + if (!__clone_symbol(e->out, patched_sym, data_too)) + return NULL; + + if (data_too && clone_sym_relocs(e, patched_sym)) + return NULL; + + return patched_sym->clone; +} + +static void mark_included_function(struct symbol *func) +{ + struct symbol *pfx; + + func->included = 1; + + /* Include prefix function */ + pfx = get_func_prefix(func); + if (pfx) + pfx->included = 1; + + /* Make sure .cold parent+child always stay together */ + if (func->cfunc && func->cfunc != func) + func->cfunc->included = 1; + if (func->pfunc && func->pfunc != func) + func->pfunc->included = 1; +} + +/* + * Copy all changed functions (and their dependencies) from the patched object + * to the output object. + */ +static int mark_changed_functions(struct elfs *e) +{ + struct symbol *sym_orig, *patched_sym; + bool changed = false; + + /* Find changed functions */ + for_each_sym(e->orig, sym_orig) { + if (!is_func_sym(sym_orig) || is_prefix_func(sym_orig)) + continue; + + patched_sym = sym_orig->twin; + if (!patched_sym) + continue; + + if (sym_orig->csum.checksum != patched_sym->csum.checksum) { + patched_sym->changed = 1; + mark_included_function(patched_sym); + changed = true; + } + } + + /* Find added functions and print them */ + for_each_sym(e->patched, patched_sym) { + if (!is_func_sym(patched_sym) || is_prefix_func(patched_sym)) + continue; + + if (!patched_sym->twin) { + printf("%s: new function: %s\n", objname, patched_sym->name); + mark_included_function(patched_sym); + changed = true; + } + } + + /* Print changed functions */ + for_each_sym(e->patched, patched_sym) { + if (patched_sym->changed) + printf("%s: changed function: %s\n", objname, patched_sym->name); + } + + return !changed ? -1 : 0; +} + +static int clone_included_functions(struct elfs *e) +{ + struct symbol *patched_sym; + + for_each_sym(e->patched, patched_sym) { + if (patched_sym->included) { + if (!clone_symbol(e, patched_sym, true)) + return -1; + } + } + + return 0; +} + +/* + * Determine whether a relocation should reference the section rather than the + * underlying symbol. + */ +static bool section_reference_needed(struct section *sec) +{ + /* + * String symbols are zero-length and uncorrelated. It's easier to + * deal with them as section symbols. + */ + if (is_string_sec(sec)) + return true; + + /* + * .rodata has mostly anonymous data so there's no way to determine the + * length of a needed reference. just copy the whole section if needed. + */ + if (strstarts(sec->name, ".rodata")) + return true; + + /* UBSAN anonymous data */ + if (strstarts(sec->name, ".data..Lubsan") || /* GCC */ + strstarts(sec->name, ".data..L__unnamed_")) /* Clang */ + return true; + + return false; +} + +static bool is_reloc_allowed(struct reloc *reloc) +{ + return section_reference_needed(reloc->sym->sec) == is_sec_sym(reloc->sym); +} + +static struct export *find_export(struct symbol *sym) +{ + struct export *export; + + hash_for_each_possible(exports, export, hash, str_hash(sym->name)) { + if (!strcmp(export->sym, sym->name)) + return export; + } + + return NULL; +} + +static const char *__find_modname(struct elfs *e) +{ + struct section *sec; + char *name; + + sec = find_section_by_name(e->orig, ".modinfo"); + if (!sec) { + ERROR("missing .modinfo section"); + return NULL; + } + + name = memmem(sec->data->d_buf, sec_size(sec), "\0name=", 6); + if (name) + return name + 6; + + name = strdup(e->orig->name); + if (!name) { + ERROR_GLIBC("strdup"); + return NULL; + } + + for (char *c = name; *c; c++) { + if (*c == '/') + name = c + 1; + else if (*c == '-') + *c = '_'; + else if (*c == '.') { + *c = '\0'; + break; + } + } + + return name; +} + +/* Get the object's module name as defined by the kernel (and klp_object) */ +static const char *find_modname(struct elfs *e) +{ + const char *modname; + + if (e->modname) + return e->modname; + + modname = __find_modname(e); + e->modname = modname; + return modname; +} + +/* + * Copying a function from its native compiled environment to a kernel module + * removes its natural access to local functions/variables and unexported + * globals. References to such symbols need to be converted to KLP relocs so + * the kernel arch relocation code knows to apply them and where to find the + * symbols. Particularly, duplicate static symbols need to be disambiguated. + */ +static bool klp_reloc_needed(struct reloc *patched_reloc) +{ + struct symbol *patched_sym = patched_reloc->sym; + struct export *export; + + /* no external symbol to reference */ + if (dont_correlate(patched_sym)) + return false; + + /* For included functions, a regular reloc will do. */ + if (patched_sym->included) + return false; + + /* + * If exported by a module, it has to be a klp reloc. Thanks to the + * clusterfunk that is late module patching, the patch module is + * allowed to be loaded before any modules it depends on. + * + * If exported by vmlinux, a normal reloc will do. + */ + export = find_export(patched_sym); + if (export) + return strcmp(export->mod, "vmlinux"); + + if (!patched_sym->twin) { + /* + * Presumably the symbol and its reference were added by the + * patch. The symbol could be defined in this .o or in another + * .o in the patch module. + * + * This check needs to be *after* the export check due to the + * possibility of the patch adding a new UNDEF reference to an + * exported symbol. + */ + return false; + } + + /* Unexported symbol which lives in the original vmlinux or module. */ + return true; +} + +static int convert_reloc_sym_to_secsym(struct elf *elf, struct reloc *reloc) +{ + struct symbol *sym = reloc->sym; + struct section *sec = sym->sec; + + if (!sec->sym && !elf_create_section_symbol(elf, sec)) + return -1; + + reloc->sym = sec->sym; + set_reloc_sym(elf, reloc, sym->idx); + set_reloc_addend(elf, reloc, sym->offset + reloc_addend(reloc)); + return 0; +} + +static int convert_reloc_secsym_to_sym(struct elf *elf, struct reloc *reloc) +{ + struct symbol *sym = reloc->sym; + struct section *sec = sym->sec; + + /* If the symbol has a dedicated section, it's easy to find */ + sym = find_symbol_by_offset(sec, 0); + if (sym && sym->len == sec_size(sec)) + goto found_sym; + + /* No dedicated section; find the symbol manually */ + sym = find_symbol_containing(sec, arch_adjusted_addend(reloc)); + if (!sym) { + /* + * This can happen for special section references to weak code + * whose symbol has been stripped by the linker. + */ + return -1; + } + +found_sym: + reloc->sym = sym; + set_reloc_sym(elf, reloc, sym->idx); + set_reloc_addend(elf, reloc, reloc_addend(reloc) - sym->offset); + return 0; +} + +/* + * Convert a relocation symbol reference to the needed format: either a section + * symbol or the underlying symbol itself. + */ +static int convert_reloc_sym(struct elf *elf, struct reloc *reloc) +{ + if (is_reloc_allowed(reloc)) + return 0; + + if (section_reference_needed(reloc->sym->sec)) + return convert_reloc_sym_to_secsym(elf, reloc); + else + return convert_reloc_secsym_to_sym(elf, reloc); +} + +/* + * Convert a regular relocation to a klp relocation (sort of). + */ +static int clone_reloc_klp(struct elfs *e, struct reloc *patched_reloc, + struct section *sec, unsigned long offset, + struct export *export) +{ + struct symbol *patched_sym = patched_reloc->sym; + s64 addend = reloc_addend(patched_reloc); + const char *sym_modname, *sym_orig_name; + static struct section *klp_relocs; + struct symbol *sym, *klp_sym; + unsigned long klp_reloc_off; + char sym_name[SYM_NAME_LEN]; + struct klp_reloc klp_reloc; + unsigned long sympos; + + if (!patched_sym->twin) { + ERROR("unexpected klp reloc for new symbol %s", patched_sym->name); + return -1; + } + + /* + * Keep the original reloc intact for now to avoid breaking objtool run + * which relies on proper relocations for many of its features. This + * will be disabled later by "objtool klp post-link". + * + * Convert it to UNDEF (and WEAK to avoid modpost warnings). + */ + + sym = patched_sym->clone; + if (!sym) { + /* STB_WEAK: avoid modpost undefined symbol warnings */ + sym = elf_create_symbol(e->out, patched_sym->name, NULL, + STB_WEAK, patched_sym->type, 0, 0); + if (!sym) + return -1; + + patched_sym->clone = sym; + sym->clone = patched_sym; + } + + if (!elf_create_reloc(e->out, sec, offset, sym, addend, reloc_type(patched_reloc))) + return -1; + + /* + * Create the KLP symbol. + */ + + if (export) { + sym_modname = export->mod; + sym_orig_name = export->sym; + sympos = 0; + } else { + sym_modname = find_modname(e); + if (!sym_modname) + return -1; + + sym_orig_name = patched_sym->twin->name; + sympos = find_sympos(e->orig, patched_sym->twin); + if (sympos == ULONG_MAX) + return -1; + } + + /* symbol format: .klp.sym.modname.sym_name,sympos */ + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_SYM_PREFIX "%s.%s,%ld", + sym_modname, sym_orig_name, sympos)) + return -1; + + klp_sym = find_symbol_by_name(e->out, sym_name); + if (!klp_sym) { + __dbg_indent("%s", sym_name); + + /* STB_WEAK: avoid modpost undefined symbol warnings */ + klp_sym = elf_create_symbol(e->out, sym_name, NULL, + STB_WEAK, patched_sym->type, 0, 0); + if (!klp_sym) + return -1; + } + + /* + * Create the __klp_relocs entry. This will be converted to an actual + * KLP rela by "objtool klp post-link". + * + * This intermediate step is necessary to prevent corruption by the + * linker, which doesn't know how to properly handle two rela sections + * applying to the same base section. + */ + + if (!klp_relocs) { + klp_relocs = elf_create_section(e->out, KLP_RELOCS_SEC, 0, + 0, SHT_PROGBITS, 8, SHF_ALLOC); + if (!klp_relocs) + return -1; + } + + klp_reloc_off = sec_size(klp_relocs); + memset(&klp_reloc, 0, sizeof(klp_reloc)); + + klp_reloc.type = reloc_type(patched_reloc); + if (!elf_add_data(e->out, klp_relocs, &klp_reloc, sizeof(klp_reloc))) + return -1; + + /* klp_reloc.offset */ + if (!sec->sym && !elf_create_section_symbol(e->out, sec)) + return -1; + + if (!elf_create_reloc(e->out, klp_relocs, + klp_reloc_off + offsetof(struct klp_reloc, offset), + sec->sym, offset, R_ABS64)) + return -1; + + /* klp_reloc.sym */ + if (!elf_create_reloc(e->out, klp_relocs, + klp_reloc_off + offsetof(struct klp_reloc, sym), + klp_sym, addend, R_ABS64)) + return -1; + + return 0; +} + +#define dbg_clone_reloc(sec, offset, patched_sym, addend, export, klp) \ + dbg_indent("%s+0x%lx: %s%s0x%lx [%s%s%s%s%s%s]", \ + sec->name, offset, patched_sym->name, \ + addend >= 0 ? "+" : "-", labs(addend), \ + sym_type(patched_sym), \ + patched_sym->type == STT_SECTION ? "" : " ", \ + patched_sym->type == STT_SECTION ? "" : sym_bind(patched_sym), \ + is_undef_sym(patched_sym) ? " UNDEF" : "", \ + export ? " EXPORTED" : "", \ + klp ? " KLP" : "") + +/* Copy a reloc and its symbol to the output object */ +static int clone_reloc(struct elfs *e, struct reloc *patched_reloc, + struct section *sec, unsigned long offset) +{ + struct symbol *patched_sym = patched_reloc->sym; + struct export *export = find_export(patched_sym); + long addend = reloc_addend(patched_reloc); + struct symbol *out_sym; + bool klp; + + if (!is_reloc_allowed(patched_reloc)) { + ERROR_FUNC(patched_reloc->sec->base, reloc_offset(patched_reloc), + "missing symbol for reference to %s+%ld", + patched_sym->name, addend); + return -1; + } + + klp = klp_reloc_needed(patched_reloc); + + dbg_clone_reloc(sec, offset, patched_sym, addend, export, klp); + + if (klp) { + if (clone_reloc_klp(e, patched_reloc, sec, offset, export)) + return -1; + + return 0; + } + + /* + * Why !export sets 'data_too': + * + * Unexported non-klp symbols need to live in the patch module, + * otherwise there will be unresolved symbols. Notably, this includes: + * + * - New functions/data + * - String sections + * - Special section entries + * - Uncorrelated static local variables + * - UBSAN sections + */ + out_sym = clone_symbol(e, patched_sym, patched_sym->included || !export); + if (!out_sym) + return -1; + + /* + * For strings, all references use section symbols, thanks to + * section_reference_needed(). clone_symbol() has cloned an empty + * version of the string section. Now copy the string itself. + */ + if (is_string_sec(patched_sym->sec)) { + const char *str = patched_sym->sec->data->d_buf + addend; + + __dbg_indent("\"%s\"", escape_str(str)); + + addend = elf_add_string(e->out, out_sym->sec, str); + if (addend == -1) + return -1; + } + + if (!elf_create_reloc(e->out, sec, offset, out_sym, addend, + reloc_type(patched_reloc))) + return -1; + + return 0; +} + +/* Copy all relocs needed for a symbol's contents */ +static int clone_sym_relocs(struct elfs *e, struct symbol *patched_sym) +{ + struct section *patched_rsec = patched_sym->sec->rsec; + struct reloc *patched_reloc; + unsigned long start, end; + struct symbol *out_sym; + + out_sym = patched_sym->clone; + if (!out_sym) { + ERROR("no clone for %s", patched_sym->name); + return -1; + } + + if (!patched_rsec) + return 0; + + if (!is_sec_sym(patched_sym) && !patched_sym->len) + return 0; + + if (is_string_sec(patched_sym->sec)) + return 0; + + if (is_sec_sym(patched_sym)) { + start = 0; + end = sec_size(patched_sym->sec); + } else { + start = patched_sym->offset; + end = start + patched_sym->len; + } + + for_each_reloc(patched_rsec, patched_reloc) { + unsigned long offset; + + if (reloc_offset(patched_reloc) < start || + reloc_offset(patched_reloc) >= end) + continue; + + /* + * Skip any reloc referencing .altinstr_aux. Its code is + * always patched by alternatives. See ALTERNATIVE_TERNARY(). + */ + if (patched_reloc->sym->sec && + !strcmp(patched_reloc->sym->sec->name, ".altinstr_aux")) + continue; + + if (convert_reloc_sym(e->patched, patched_reloc)) { + ERROR_FUNC(patched_rsec->base, reloc_offset(patched_reloc), + "failed to convert reloc sym '%s' to its proper format", + patched_reloc->sym->name); + return -1; + } + + offset = out_sym->offset + (reloc_offset(patched_reloc) - patched_sym->offset); + + if (clone_reloc(e, patched_reloc, out_sym->sec, offset)) + return -1; + } + return 0; + +} + +static int create_fake_symbol(struct elf *elf, struct section *sec, + unsigned long offset, size_t size) +{ + char name[SYM_NAME_LEN]; + unsigned int type; + static int ctr; + char *c; + + if (snprintf_check(name, SYM_NAME_LEN, "%s_%d", sec->name, ctr++)) + return -1; + + for (c = name; *c; c++) + if (*c == '.') + *c = '_'; + + /* + * STT_NOTYPE: Prevent objtool from validating .altinstr_replacement + * while still allowing objdump to disassemble it. + */ + type = is_text_sec(sec) ? STT_NOTYPE : STT_OBJECT; + return elf_create_symbol(elf, name, sec, STB_LOCAL, type, offset, size) ? 0 : -1; +} + +/* + * Special sections (alternatives, etc) are basically arrays of structs. + * For all the special sections, create a symbol for each struct entry. This + * is a bit cumbersome, but it makes the extracting of the individual entries + * much more straightforward. + * + * There are three ways to identify the entry sizes for a special section: + * + * 1) ELF section header sh_entsize: Ideally this would be used almost + * everywhere. But unfortunately the toolchains make it difficult. The + * assembler .[push]section directive syntax only takes entsize when + * combined with SHF_MERGE. But Clang disallows combining SHF_MERGE with + * SHF_WRITE. And some special sections do need to be writable. + * + * Another place this wouldn't work is .altinstr_replacement, whose entries + * don't have a fixed size. + * + * 2) ANNOTATE_DATA_SPECIAL: This is a lightweight objtool annotation which + * points to the beginning of each entry. The size of the entry is then + * inferred by the location of the subsequent annotation (or end of + * section). + * + * 3) Simple array of pointers: If the special section is just a basic array of + * pointers, the entry size can be inferred by the number of relocations. + * No annotations needed. + * + * Note I also tried to create per-entry symbols at the time of creation, in + * the original [inline] asm. Unfortunately, creating uniquely named symbols + * is trickier than one might think, especially with Clang inline asm. I + * eventually just gave up trying to make that work, in favor of using + * ANNOTATE_DATA_SPECIAL and creating the symbols here after the fact. + */ +static int create_fake_symbols(struct elf *elf) +{ + struct section *sec; + struct reloc *reloc; + + /* + * 1) Make symbols for all the ANNOTATE_DATA_SPECIAL entries: + */ + + sec = find_section_by_name(elf, ".discard.annotate_data"); + if (!sec || !sec->rsec) + return 0; + + for_each_reloc(sec->rsec, reloc) { + unsigned long offset, size; + struct reloc *next_reloc; + + if (annotype(elf, sec, reloc) != ANNOTYPE_DATA_SPECIAL) + continue; + + offset = reloc_addend(reloc); + + size = 0; + next_reloc = reloc; + for_each_reloc_continue(sec->rsec, next_reloc) { + if (annotype(elf, sec, next_reloc) != ANNOTYPE_DATA_SPECIAL || + next_reloc->sym->sec != reloc->sym->sec) + continue; + + size = reloc_addend(next_reloc) - offset; + break; + } + + if (!size) + size = sec_size(reloc->sym->sec) - offset; + + if (create_fake_symbol(elf, reloc->sym->sec, offset, size)) + return -1; + } + + /* + * 2) Make symbols for sh_entsize, and simple arrays of pointers: + */ + + for_each_sec(elf, sec) { + unsigned int entry_size; + unsigned long offset; + + if (!is_special_section(sec) || find_symbol_by_offset(sec, 0)) + continue; + + if (!sec->rsec) { + ERROR("%s: missing special section relocations", sec->name); + return -1; + } + + entry_size = sec->sh.sh_entsize; + if (!entry_size) { + entry_size = arch_reloc_size(sec->rsec->relocs); + if (sec_size(sec) != entry_size * sec_num_entries(sec->rsec)) { + ERROR("%s: missing special section entsize or annotations", sec->name); + return -1; + } + } + + for (offset = 0; offset < sec_size(sec); offset += entry_size) { + if (create_fake_symbol(elf, sec, offset, entry_size)) + return -1; + } + } + + return 0; +} + +/* Keep a special section entry if it references an included function */ +static bool should_keep_special_sym(struct elf *elf, struct symbol *sym) +{ + struct reloc *reloc; + + if (is_sec_sym(sym) || !sym->sec->rsec) + return false; + + sym_for_each_reloc(elf, sym, reloc) { + if (convert_reloc_sym(elf, reloc)) + continue; + + if (is_func_sym(reloc->sym) && reloc->sym->included) + return true; + } + + return false; +} + +/* + * Klp relocations aren't allowed for __jump_table and .static_call_sites if + * the referenced symbol lives in a kernel module, because such klp relocs may + * be applied after static branch/call init, resulting in code corruption. + * + * Validate a special section entry to avoid that. Note that an inert + * tracepoint is harmless enough, in that case just skip the entry and print a + * warning. Otherwise, return an error. + * + * This is only a temporary limitation which will be fixed when livepatch adds + * support for submodules: fully self-contained modules which are embedded in + * the top-level livepatch module's data and which can be loaded on demand when + * their corresponding to-be-patched module gets loaded. Then klp relocs can + * be retired. + * + * Return: + * -1: error: validation failed + * 1: warning: tracepoint skipped + * 0: success + */ +static int validate_special_section_klp_reloc(struct elfs *e, struct symbol *sym) +{ + bool static_branch = !strcmp(sym->sec->name, "__jump_table"); + bool static_call = !strcmp(sym->sec->name, ".static_call_sites"); + struct symbol *code_sym = NULL; + unsigned long code_offset = 0; + struct reloc *reloc; + int ret = 0; + + if (!static_branch && !static_call) + return 0; + + sym_for_each_reloc(e->patched, sym, reloc) { + const char *sym_modname; + struct export *export; + + /* Static branch/call keys are always STT_OBJECT */ + if (reloc->sym->type != STT_OBJECT) { + + /* Save code location which can be printed below */ + if (reloc->sym->type == STT_FUNC && !code_sym) { + code_sym = reloc->sym; + code_offset = reloc_addend(reloc); + } + + continue; + } + + if (!klp_reloc_needed(reloc)) + continue; + + export = find_export(reloc->sym); + if (export) { + sym_modname = export->mod; + } else { + sym_modname = find_modname(e); + if (!sym_modname) + return -1; + } + + /* vmlinux keys are ok */ + if (!strcmp(sym_modname, "vmlinux")) + continue; + + if (static_branch) { + if (strstarts(reloc->sym->name, "__tracepoint_")) { + WARN("%s: disabling unsupported tracepoint %s", + code_sym->name, reloc->sym->name + 13); + ret = 1; + continue; + } + + ERROR("%s+0x%lx: unsupported static branch key %s. Use static_key_enabled() instead", + code_sym->name, code_offset, reloc->sym->name); + return -1; + } + + /* static call */ + if (strstarts(reloc->sym->name, "__SCK__tp_func_")) { + ret = 1; + continue; + } + + ERROR("%s()+0x%lx: unsupported static call key %s. Use KLP_STATIC_CALL() instead", + code_sym->name, code_offset, reloc->sym->name); + return -1; + } + + return ret; +} + +static int clone_special_section(struct elfs *e, struct section *patched_sec) +{ + struct symbol *patched_sym; + + /* + * Extract all special section symbols (and their dependencies) which + * reference included functions. + */ + sec_for_each_sym(patched_sec, patched_sym) { + int ret; + + if (!is_object_sym(patched_sym)) + continue; + + if (!should_keep_special_sym(e->patched, patched_sym)) + continue; + + ret = validate_special_section_klp_reloc(e, patched_sym); + if (ret < 0) + return -1; + if (ret > 0) + continue; + + if (!clone_symbol(e, patched_sym, true)) + return -1; + } + + return 0; +} + +/* Extract only the needed bits from special sections */ +static int clone_special_sections(struct elfs *e) +{ + struct section *patched_sec; + + if (create_fake_symbols(e->patched)) + return -1; + + for_each_sec(e->patched, patched_sec) { + if (is_special_section(patched_sec)) { + if (clone_special_section(e, patched_sec)) + return -1; + } + } + + return 0; +} + +/* + * Create __klp_objects and __klp_funcs sections which are intermediate + * sections provided as input to the patch module's init code for building the + * klp_patch, klp_object and klp_func structs for the livepatch API. + */ +static int create_klp_sections(struct elfs *e) +{ + size_t obj_size = sizeof(struct klp_object_ext); + size_t func_size = sizeof(struct klp_func_ext); + struct section *obj_sec, *funcs_sec, *str_sec; + struct symbol *funcs_sym, *str_sym, *sym; + char sym_name[SYM_NAME_LEN]; + unsigned int nr_funcs = 0; + const char *modname; + void *obj_data; + s64 addend; + + obj_sec = elf_create_section_pair(e->out, KLP_OBJECTS_SEC, obj_size, 0, 0); + if (!obj_sec) + return -1; + + funcs_sec = elf_create_section_pair(e->out, KLP_FUNCS_SEC, func_size, 0, 0); + if (!funcs_sec) + return -1; + + funcs_sym = elf_create_section_symbol(e->out, funcs_sec); + if (!funcs_sym) + return -1; + + str_sec = elf_create_section(e->out, KLP_STRINGS_SEC, 0, 0, + SHT_PROGBITS, 1, + SHF_ALLOC | SHF_STRINGS | SHF_MERGE); + if (!str_sec) + return -1; + + if (elf_add_string(e->out, str_sec, "") == -1) + return -1; + + str_sym = elf_create_section_symbol(e->out, str_sec); + if (!str_sym) + return -1; + + /* allocate klp_object_ext */ + obj_data = elf_add_data(e->out, obj_sec, NULL, obj_size); + if (!obj_data) + return -1; + + modname = find_modname(e); + if (!modname) + return -1; + + /* klp_object_ext.name */ + if (strcmp(modname, "vmlinux")) { + addend = elf_add_string(e->out, str_sec, modname); + if (addend == -1) + return -1; + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, name), + str_sym, addend, R_ABS64)) + return -1; + } + + /* klp_object_ext.funcs */ + if (!elf_create_reloc(e->out, obj_sec, offsetof(struct klp_object_ext, funcs), + funcs_sym, 0, R_ABS64)) + return -1; + + for_each_sym(e->out, sym) { + unsigned long offset = nr_funcs * func_size; + unsigned long sympos; + void *func_data; + + if (!is_func_sym(sym) || sym->cold || !sym->clone || !sym->clone->changed) + continue; + + /* allocate klp_func_ext */ + func_data = elf_add_data(e->out, funcs_sec, NULL, func_size); + if (!func_data) + return -1; + + /* klp_func_ext.old_name */ + addend = elf_add_string(e->out, str_sec, sym->clone->twin->name); + if (addend == -1) + return -1; + + if (!elf_create_reloc(e->out, funcs_sec, + offset + offsetof(struct klp_func_ext, old_name), + str_sym, addend, R_ABS64)) + return -1; + + /* klp_func_ext.new_func */ + if (!elf_create_reloc(e->out, funcs_sec, + offset + offsetof(struct klp_func_ext, new_func), + sym, 0, R_ABS64)) + return -1; + + /* klp_func_ext.sympos */ + BUILD_BUG_ON(sizeof(sympos) != sizeof_field(struct klp_func_ext, sympos)); + sympos = find_sympos(e->orig, sym->clone->twin); + if (sympos == ULONG_MAX) + return -1; + memcpy(func_data + offsetof(struct klp_func_ext, sympos), &sympos, + sizeof_field(struct klp_func_ext, sympos)); + + nr_funcs++; + } + + /* klp_object_ext.nr_funcs */ + BUILD_BUG_ON(sizeof(nr_funcs) != sizeof_field(struct klp_object_ext, nr_funcs)); + memcpy(obj_data + offsetof(struct klp_object_ext, nr_funcs), &nr_funcs, + sizeof_field(struct klp_object_ext, nr_funcs)); + + /* + * Find callback pointers created by KLP_PRE_PATCH_CALLBACK() and + * friends, and add them to the klp object. + */ + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_PRE_PATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, pre_patch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_POST_PATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, post_patch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_PRE_UNPATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, pre_unpatch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + if (snprintf_check(sym_name, SYM_NAME_LEN, KLP_POST_UNPATCH_PREFIX "%s", modname)) + return -1; + + sym = find_symbol_by_name(e->out, sym_name); + if (sym) { + struct reloc *reloc; + + reloc = find_reloc_by_dest(e->out, sym->sec, sym->offset); + + if (!elf_create_reloc(e->out, obj_sec, + offsetof(struct klp_object_ext, callbacks) + + offsetof(struct klp_callbacks, post_unpatch), + reloc->sym, reloc_addend(reloc), R_ABS64)) + return -1; + } + + return 0; +} + +/* + * Copy all .modinfo import_ns= tags to ensure all namespaced exported symbols + * can be accessed via normal relocs. + */ +static int copy_import_ns(struct elfs *e) +{ + struct section *patched_sec, *out_sec = NULL; + char *import_ns, *data_end; + + patched_sec = find_section_by_name(e->patched, ".modinfo"); + if (!patched_sec) + return 0; + + import_ns = patched_sec->data->d_buf; + if (!import_ns) + return 0; + + for (data_end = import_ns + sec_size(patched_sec); + import_ns < data_end; + import_ns += strlen(import_ns) + 1) { + + import_ns = memmem(import_ns, data_end - import_ns, "import_ns=", 10); + if (!import_ns) + return 0; + + if (!out_sec) { + out_sec = find_section_by_name(e->out, ".modinfo"); + if (!out_sec) { + out_sec = elf_create_section(e->out, ".modinfo", 0, + patched_sec->sh.sh_entsize, + patched_sec->sh.sh_type, + patched_sec->sh.sh_addralign, + patched_sec->sh.sh_flags); + if (!out_sec) + return -1; + } + } + + if (!elf_add_data(e->out, out_sec, import_ns, strlen(import_ns) + 1)) + return -1; + } + + return 0; +} + +int cmd_klp_diff(int argc, const char **argv) +{ + struct elfs e = {0}; + + argc = parse_options(argc, argv, klp_diff_options, klp_diff_usage, 0); + if (argc != 3) + usage_with_options(klp_diff_usage, klp_diff_options); + + objname = argv[0]; + + e.orig = elf_open_read(argv[0], O_RDONLY); + e.patched = elf_open_read(argv[1], O_RDONLY); + e.out = NULL; + + if (!e.orig || !e.patched) + return -1; + + if (read_exports()) + return -1; + + if (read_sym_checksums(e.orig)) + return -1; + + if (read_sym_checksums(e.patched)) + return -1; + + if (correlate_symbols(&e)) + return -1; + + if (mark_changed_functions(&e)) + return 0; + + e.out = elf_create_file(&e.orig->ehdr, argv[2]); + if (!e.out) + return -1; + + if (clone_included_functions(&e)) + return -1; + + if (clone_special_sections(&e)) + return -1; + + if (create_klp_sections(&e)) + return -1; + + if (copy_import_ns(&e)) + return -1; + + if (elf_write(e.out)) + return -1; + + return elf_close(e.out); +} diff --git a/tools/objtool/klp-post-link.c b/tools/objtool/klp-post-link.c new file mode 100644 index 000000000000..c013e39957b1 --- /dev/null +++ b/tools/objtool/klp-post-link.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Read the intermediate KLP reloc/symbol representations created by klp diff + * and convert them to the proper format required by livepatch. This needs to + * run last to avoid linker wreckage. Linkers don't tend to handle the "two + * rela sections for a single base section" case very well, nor do they like + * SHN_LIVEPATCH. + * + * This is the final tool in the livepatch module generation pipeline: + * + * kernel builds -> objtool klp diff -> module link -> objtool klp post-link + */ + +#include <fcntl.h> +#include <gelf.h> +#include <objtool/objtool.h> +#include <objtool/warn.h> +#include <objtool/klp.h> +#include <objtool/util.h> +#include <linux/livepatch_external.h> + +static int fix_klp_relocs(struct elf *elf) +{ + struct section *symtab, *klp_relocs; + + klp_relocs = find_section_by_name(elf, KLP_RELOCS_SEC); + if (!klp_relocs) + return 0; + + symtab = find_section_by_name(elf, ".symtab"); + if (!symtab) { + ERROR("missing .symtab"); + return -1; + } + + for (int i = 0; i < sec_size(klp_relocs) / sizeof(struct klp_reloc); i++) { + struct klp_reloc *klp_reloc; + unsigned long klp_reloc_off; + struct section *sec, *tmp, *klp_rsec; + unsigned long offset; + struct reloc *reloc; + char sym_modname[64]; + char rsec_name[SEC_NAME_LEN]; + u64 addend; + struct symbol *sym, *klp_sym; + + klp_reloc_off = i * sizeof(*klp_reloc); + klp_reloc = klp_relocs->data->d_buf + klp_reloc_off; + + /* + * Read __klp_relocs[i]: + */ + + /* klp_reloc.sec_offset */ + reloc = find_reloc_by_dest(elf, klp_relocs, + klp_reloc_off + offsetof(struct klp_reloc, offset)); + if (!reloc) { + ERROR("malformed " KLP_RELOCS_SEC " section"); + return -1; + } + + sec = reloc->sym->sec; + offset = reloc_addend(reloc); + + /* klp_reloc.sym */ + reloc = find_reloc_by_dest(elf, klp_relocs, + klp_reloc_off + offsetof(struct klp_reloc, sym)); + if (!reloc) { + ERROR("malformed " KLP_RELOCS_SEC " section"); + return -1; + } + + klp_sym = reloc->sym; + addend = reloc_addend(reloc); + + /* symbol format: .klp.sym.modname.sym_name,sympos */ + if (sscanf(klp_sym->name + strlen(KLP_SYM_PREFIX), "%55[^.]", sym_modname) != 1) + ERROR("can't find modname in klp symbol '%s'", klp_sym->name); + + /* + * Create the KLP rela: + */ + + /* section format: .klp.rela.sec_objname.section_name */ + if (snprintf_check(rsec_name, SEC_NAME_LEN, + KLP_RELOC_SEC_PREFIX "%s.%s", + sym_modname, sec->name)) + return -1; + + klp_rsec = find_section_by_name(elf, rsec_name); + if (!klp_rsec) { + klp_rsec = elf_create_section(elf, rsec_name, 0, + elf_rela_size(elf), + SHT_RELA, elf_addr_size(elf), + SHF_ALLOC | SHF_INFO_LINK | SHF_RELA_LIVEPATCH); + if (!klp_rsec) + return -1; + + klp_rsec->sh.sh_link = symtab->idx; + klp_rsec->sh.sh_info = sec->idx; + klp_rsec->base = sec; + } + + tmp = sec->rsec; + sec->rsec = klp_rsec; + if (!elf_create_reloc(elf, sec, offset, klp_sym, addend, klp_reloc->type)) + return -1; + sec->rsec = tmp; + + /* + * Fix up the corresponding KLP symbol: + */ + + klp_sym->sym.st_shndx = SHN_LIVEPATCH; + if (!gelf_update_sym(symtab->data, klp_sym->idx, &klp_sym->sym)) { + ERROR_ELF("gelf_update_sym"); + return -1; + } + + /* + * Disable the original non-KLP reloc by converting it to R_*_NONE: + */ + + reloc = find_reloc_by_dest(elf, sec, offset); + sym = reloc->sym; + sym->sym.st_shndx = SHN_LIVEPATCH; + set_reloc_type(elf, reloc, 0); + if (!gelf_update_sym(symtab->data, sym->idx, &sym->sym)) { + ERROR_ELF("gelf_update_sym"); + return -1; + } + } + + return 0; +} + +/* + * This runs on the livepatch module after all other linking has been done. It + * converts the intermediate __klp_relocs section into proper KLP relocs to be + * processed by livepatch. This needs to run last to avoid linker wreckage. + * Linkers don't tend to handle the "two rela sections for a single base + * section" case very well, nor do they appreciate SHN_LIVEPATCH. + */ +int cmd_klp_post_link(int argc, const char **argv) +{ + struct elf *elf; + + argc--; + argv++; + + if (argc != 1) { + fprintf(stderr, "%d\n", argc); + fprintf(stderr, "usage: objtool link <file.ko>\n"); + return -1; + } + + elf = elf_open_read(argv[0], O_RDWR); + if (!elf) + return -1; + + if (fix_klp_relocs(elf)) + return -1; + + if (elf_write(elf)) + return -1; + + return elf_close(elf); +} diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 802895fae3ca..14f8ab653449 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -36,6 +36,7 @@ NORETURN(machine_real_restart) NORETURN(make_task_dead) NORETURN(mpt_halt_firmware) NORETURN(mwait_play_dead) +NORETURN(native_play_dead) NORETURN(nmi_panic_self_stop) NORETURN(panic) NORETURN(vpanic) diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 5c8b974ad0f9..3c26ed561c7e 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -16,7 +16,8 @@ #include <objtool/objtool.h> #include <objtool/warn.h> -bool help; +bool debug; +int indent; static struct objtool_file file; @@ -71,6 +72,39 @@ int objtool_pv_add(struct objtool_file *f, int idx, struct symbol *func) return 0; } +char *top_level_dir(const char *file) +{ + ssize_t len, self_len, file_len; + char self[PATH_MAX], *str; + int i; + + len = readlink("/proc/self/exe", self, sizeof(self) - 1); + if (len <= 0) + return NULL; + self[len] = '\0'; + + for (i = 0; i < 3; i++) { + char *s = strrchr(self, '/'); + if (!s) + return NULL; + *s = '\0'; + } + + self_len = strlen(self); + file_len = strlen(file); + + str = malloc(self_len + file_len + 2); + if (!str) + return NULL; + + memcpy(str, self, self_len); + str[self_len] = '/'; + strcpy(str + self_len + 1, file); + + return str; +} + + int main(int argc, const char **argv) { static const char *UNUSED = "OBJTOOL_NOT_IMPLEMENTED"; @@ -79,5 +113,11 @@ int main(int argc, const char **argv) exec_cmd_init("objtool", UNUSED, UNUSED, UNUSED); pager_init(UNUSED); + if (argc > 1 && !strcmp(argv[1], "klp")) { + argc--; + argv++; + return cmd_klp(argc, argv); + } + return objtool_run(argc, argv); } diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index 1dd9fc18fe62..5a979f52425a 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -8,7 +8,6 @@ #include <objtool/objtool.h> #include <objtool/orc.h> #include <objtool/warn.h> -#include <objtool/endianness.h> int orc_dump(const char *filename) { diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index 922e6aac7cea..1045e1380ffd 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -12,7 +12,6 @@ #include <objtool/check.h> #include <objtool/orc.h> #include <objtool/warn.h> -#include <objtool/endianness.h> struct orc_list_entry { struct list_head list; @@ -57,7 +56,7 @@ int orc_create(struct objtool_file *file) /* Build a deduplicated list of ORC entries: */ INIT_LIST_HEAD(&orc_list); - for_each_sec(file, sec) { + for_each_sec(file->elf, sec) { struct orc_entry orc, prev_orc = {0}; struct instruction *insn; bool empty = true; @@ -127,7 +126,11 @@ int orc_create(struct objtool_file *file) return -1; } orc_sec = elf_create_section(file->elf, ".orc_unwind", - sizeof(struct orc_entry), nr); + nr * sizeof(struct orc_entry), + sizeof(struct orc_entry), + SHT_PROGBITS, + 1, + SHF_ALLOC); if (!orc_sec) return -1; diff --git a/tools/objtool/special.c b/tools/objtool/special.c index c80fed8a840e..2a533afbc69a 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -15,7 +15,6 @@ #include <objtool/builtin.h> #include <objtool/special.h> #include <objtool/warn.h> -#include <objtool/endianness.h> struct special_entry { const char *sec; @@ -82,6 +81,8 @@ static int get_alt_entry(struct elf *elf, const struct special_entry *entry, entry->orig_len); alt->new_len = *(unsigned char *)(sec->data->d_buf + offset + entry->new_len); + alt->feature = *(unsigned int *)(sec->data->d_buf + offset + + entry->feature); } orig_reloc = find_reloc_by_dest(elf, sec, offset + entry->orig); @@ -133,7 +134,7 @@ int special_get_alts(struct elf *elf, struct list_head *alts) struct section *sec; unsigned int nr_entries; struct special_alt *alt; - int idx, ret; + int idx; INIT_LIST_HEAD(alts); @@ -142,12 +143,12 @@ int special_get_alts(struct elf *elf, struct list_head *alts) if (!sec) continue; - if (sec->sh.sh_size % entry->size != 0) { + if (sec_size(sec) % entry->size != 0) { ERROR("%s size not a multiple of %d", sec->name, entry->size); return -1; } - nr_entries = sec->sh.sh_size / entry->size; + nr_entries = sec_size(sec) / entry->size; for (idx = 0; idx < nr_entries; idx++) { alt = malloc(sizeof(*alt)); @@ -157,11 +158,8 @@ int special_get_alts(struct elf *elf, struct list_head *alts) } memset(alt, 0, sizeof(*alt)); - ret = get_alt_entry(elf, entry, sec, idx, alt); - if (ret > 0) - continue; - if (ret < 0) - return ret; + if (get_alt_entry(elf, entry, sec, idx, alt)) + return -1; list_add_tail(&alt->list, alts); } diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh index 81d120d05442..e38167ca56a9 100755 --- a/tools/objtool/sync-check.sh +++ b/tools/objtool/sync-check.sh @@ -16,6 +16,8 @@ arch/x86/include/asm/orc_types.h arch/x86/include/asm/emulate_prefix.h arch/x86/lib/x86-opcode-map.txt arch/x86/tools/gen-insn-attr-x86.awk +include/linux/interval_tree_generic.h +include/linux/livepatch_external.h include/linux/static_call_types.h " diff --git a/tools/objtool/trace.c b/tools/objtool/trace.c new file mode 100644 index 000000000000..5dec44dab781 --- /dev/null +++ b/tools/objtool/trace.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2025, Oracle and/or its affiliates. + */ + +#include <objtool/trace.h> + +bool trace; +int trace_depth; + +/* + * Macros to trace CFI state attributes changes. + */ + +#define TRACE_CFI_ATTR(attr, prev, next, fmt, ...) \ +({ \ + if ((prev)->attr != (next)->attr) \ + TRACE("%s=" fmt " ", #attr, __VA_ARGS__); \ +}) + +#define TRACE_CFI_ATTR_BOOL(attr, prev, next) \ + TRACE_CFI_ATTR(attr, prev, next, \ + "%s", (next)->attr ? "true" : "false") + +#define TRACE_CFI_ATTR_NUM(attr, prev, next, fmt) \ + TRACE_CFI_ATTR(attr, prev, next, fmt, (next)->attr) + +#define CFI_REG_NAME_MAXLEN 16 + +/* + * Return the name of a register. Note that the same static buffer + * is returned if the name is dynamically generated. + */ +static const char *cfi_reg_name(unsigned int reg) +{ + static char rname_buffer[CFI_REG_NAME_MAXLEN]; + const char *rname; + + switch (reg) { + case CFI_UNDEFINED: + return "<undefined>"; + case CFI_CFA: + return "cfa"; + case CFI_SP_INDIRECT: + return "(sp)"; + case CFI_BP_INDIRECT: + return "(bp)"; + } + + if (reg < CFI_NUM_REGS) { + rname = arch_reg_name[reg]; + if (rname) + return rname; + } + + if (snprintf(rname_buffer, CFI_REG_NAME_MAXLEN, "r%d", reg) == -1) + return "<error>"; + + return (const char *)rname_buffer; +} + +/* + * Functions and macros to trace CFI registers changes. + */ + +static void trace_cfi_reg(const char *prefix, int reg, const char *fmt, + int base_prev, int offset_prev, + int base_next, int offset_next) +{ + char *rname; + + if (base_prev == base_next && offset_prev == offset_next) + return; + + if (prefix) + TRACE("%s:", prefix); + + if (base_next == CFI_UNDEFINED) { + TRACE("%1$s=<undef> ", cfi_reg_name(reg)); + } else { + rname = strdup(cfi_reg_name(reg)); + TRACE(fmt, rname, cfi_reg_name(base_next), offset_next); + free(rname); + } +} + +static void trace_cfi_reg_val(const char *prefix, int reg, + int base_prev, int offset_prev, + int base_next, int offset_next) +{ + trace_cfi_reg(prefix, reg, "%1$s=%2$s%3$+d ", + base_prev, offset_prev, base_next, offset_next); +} + +static void trace_cfi_reg_ref(const char *prefix, int reg, + int base_prev, int offset_prev, + int base_next, int offset_next) +{ + trace_cfi_reg(prefix, reg, "%1$s=(%2$s%3$+d) ", + base_prev, offset_prev, base_next, offset_next); +} + +#define TRACE_CFI_REG_VAL(reg, prev, next) \ + trace_cfi_reg_val(NULL, reg, prev.base, prev.offset, \ + next.base, next.offset) + +#define TRACE_CFI_REG_REF(reg, prev, next) \ + trace_cfi_reg_ref(NULL, reg, prev.base, prev.offset, \ + next.base, next.offset) + +void trace_insn_state(struct instruction *insn, struct insn_state *sprev, + struct insn_state *snext) +{ + struct cfi_state *cprev, *cnext; + int i; + + if (!memcmp(sprev, snext, sizeof(struct insn_state))) + return; + + cprev = &sprev->cfi; + cnext = &snext->cfi; + + disas_print_insn(stderr, objtool_disas_ctx, insn, + trace_depth - 1, "state: "); + + /* print registers changes */ + TRACE_CFI_REG_VAL(CFI_CFA, cprev->cfa, cnext->cfa); + for (i = 0; i < CFI_NUM_REGS; i++) { + TRACE_CFI_REG_VAL(i, cprev->vals[i], cnext->vals[i]); + TRACE_CFI_REG_REF(i, cprev->regs[i], cnext->regs[i]); + } + + /* print attributes changes */ + TRACE_CFI_ATTR_NUM(stack_size, cprev, cnext, "%d"); + TRACE_CFI_ATTR_BOOL(drap, cprev, cnext); + if (cnext->drap) { + trace_cfi_reg_val("drap", cnext->drap_reg, + cprev->drap_reg, cprev->drap_offset, + cnext->drap_reg, cnext->drap_offset); + } + TRACE_CFI_ATTR_BOOL(bp_scratch, cprev, cnext); + TRACE_CFI_ATTR_NUM(instr, sprev, snext, "%d"); + TRACE_CFI_ATTR_NUM(uaccess_stack, sprev, snext, "%u"); + + TRACE("\n"); + + insn->trace = 1; +} + +void trace_alt_begin(struct instruction *orig_insn, struct alternative *alt, + char *alt_name) +{ + struct instruction *alt_insn; + char suffix[2]; + + alt_insn = alt->insn; + + if (alt->type == ALT_TYPE_EX_TABLE) { + /* + * When there is an exception table then the instruction + * at the original location is executed but it can cause + * an exception. In that case, the execution will be + * redirected to the alternative instruction. + * + * The instruction at the original location can have + * instruction alternatives, so we just print the location + * of the instruction that can cause the exception and + * not the instruction itself. + */ + TRACE_ALT_INFO_NOADDR(orig_insn, "/ ", "%s for instruction at 0x%lx <%s+0x%lx>", + alt_name, + orig_insn->offset, orig_insn->sym->name, + orig_insn->offset - orig_insn->sym->offset); + } else { + TRACE_ALT_INFO_NOADDR(orig_insn, "/ ", "%s", alt_name); + } + + if (alt->type == ALT_TYPE_JUMP_TABLE) { + /* + * For a jump alternative, if the default instruction is + * a NOP then it is replaced with the jmp instruction, + * otherwise it is replaced with a NOP instruction. + */ + trace_depth++; + if (orig_insn->type == INSN_NOP) { + suffix[0] = (orig_insn->len == 5) ? 'q' : '\0'; + TRACE_ADDR(orig_insn, "jmp%-3s %lx <%s+0x%lx>", suffix, + alt_insn->offset, alt_insn->sym->name, + alt_insn->offset - alt_insn->sym->offset); + } else { + TRACE_ADDR(orig_insn, "nop%d", orig_insn->len); + trace_depth--; + } + } +} + +void trace_alt_end(struct instruction *orig_insn, struct alternative *alt, + char *alt_name) +{ + if (alt->type == ALT_TYPE_JUMP_TABLE && orig_insn->type == INSN_NOP) + trace_depth--; + TRACE_ALT_INFO_NOADDR(orig_insn, "\\ ", "%s", alt_name); +} diff --git a/tools/objtool/weak.c b/tools/objtool/weak.c index d83f607733b0..d6562f292259 100644 --- a/tools/objtool/weak.c +++ b/tools/objtool/weak.c @@ -8,6 +8,8 @@ #include <stdbool.h> #include <errno.h> #include <objtool/objtool.h> +#include <objtool/arch.h> +#include <objtool/builtin.h> #define UNSUPPORTED(name) \ ({ \ @@ -24,3 +26,8 @@ int __weak orc_create(struct objtool_file *file) { UNSUPPORTED("ORC"); } + +int __weak cmd_klp(int argc, const char **argv) +{ + UNSUPPORTED("klp"); +} diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 5700516aa84a..2dd5f5a60568 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -354,9 +354,6 @@ FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS) FEATURE_CHECK_LDFLAGS-libaio = -lrt -FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl -FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl - CORE_CFLAGS += -fno-omit-frame-pointer CORE_CFLAGS += -Wall CORE_CFLAGS += -Wextra @@ -930,6 +927,8 @@ ifdef BUILD_NONDISTRO ifeq ($(feature-libbfd), 1) EXTLIBS += -lbfd -lopcodes + FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl + FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl else # we are on a system that requires -liberty and (maybe) -lz # to link against -lbfd; test each case individually here diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 47c906b807ef..02f87c49801f 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -234,12 +234,12 @@ endif # The fixdep build - we force fixdep tool to be built as # the first target in the separate make session not to be # disturbed by any parallel make jobs. Once fixdep is done -# we issue the requested build with FIXDEP=1 variable. +# we issue the requested build with FIXDEP_BUILT=1 variable. # # The fixdep build is disabled for $(NON_CONFIG_TARGETS) # targets, because it's not necessary. -ifdef FIXDEP +ifdef FIXDEP_BUILT force_fixdep := 0 else force_fixdep := $(config) @@ -286,7 +286,7 @@ $(goals) all: sub-make sub-make: fixdep @./check-headers.sh - $(Q)$(MAKE) FIXDEP=1 -f Makefile.perf $(goals) + $(Q)$(MAKE) FIXDEP_BUILT=1 -f Makefile.perf $(goals) else # force_fixdep diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 92cf0fe2291e..ced2a1deecd7 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq 335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 078634461df2..e8962c985d34 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -1867,6 +1867,7 @@ static int __cmd_report(bool display_info) eops.sample = process_sample_event; eops.comm = perf_event__process_comm; eops.mmap = perf_event__process_mmap; + eops.mmap2 = perf_event__process_mmap2; eops.namespaces = perf_event__process_namespaces; eops.tracing_data = perf_event__process_tracing_data; session = perf_session__new(&data, &eops); @@ -2023,6 +2024,7 @@ static int __cmd_contention(int argc, const char **argv) eops.sample = process_sample_event; eops.comm = perf_event__process_comm; eops.mmap = perf_event__process_mmap; + eops.mmap2 = perf_event__process_mmap2; eops.tracing_data = perf_event__process_tracing_data; perf_env__init(&host_env); diff --git a/tools/perf/tests/shell/lock_contention.sh b/tools/perf/tests/shell/lock_contention.sh index 7248a74ca2a3..6dd90519f45c 100755 --- a/tools/perf/tests/shell/lock_contention.sh +++ b/tools/perf/tests/shell/lock_contention.sh @@ -13,15 +13,18 @@ cleanup() { rm -f ${perfdata} rm -f ${result} rm -f ${errout} - trap - EXIT TERM INT + trap - EXIT TERM INT ERR } trap_cleanup() { + if (( $? == 139 )); then #SIGSEGV + err=1 + fi echo "Unexpected signal in ${FUNCNAME[1]}" cleanup exit ${err} } -trap trap_cleanup EXIT TERM INT +trap trap_cleanup EXIT TERM INT ERR check() { if [ "$(id -u)" != 0 ]; then @@ -145,7 +148,7 @@ test_aggr_cgroup() fi # the perf lock contention output goes to the stderr - perf lock con -a -b -g -E 1 -q -- perf bench sched messaging -p > /dev/null 2> ${result} + perf lock con -a -b --lock-cgroup -E 1 -q -- perf bench sched messaging -p > /dev/null 2> ${result} if [ "$(cat "${result}" | wc -l)" != "1" ]; then echo "[Fail] BPF result count is not 1:" "$(cat "${result}" | wc -l)" err=1 @@ -271,7 +274,7 @@ test_cgroup_filter() return fi - perf lock con -a -b -g -E 1 -F wait_total -q -- perf bench sched messaging -p > /dev/null 2> ${result} + perf lock con -a -b --lock-cgroup -E 1 -F wait_total -q -- perf bench sched messaging -p > /dev/null 2> ${result} if [ "$(cat "${result}" | wc -l)" != "1" ]; then echo "[Fail] BPF result should have a cgroup result:" "$(cat "${result}")" err=1 @@ -279,7 +282,7 @@ test_cgroup_filter() fi cgroup=$(cat "${result}" | awk '{ print $3 }') - perf lock con -a -b -g -E 1 -G "${cgroup}" -q -- perf bench sched messaging -p > /dev/null 2> ${result} + perf lock con -a -b --lock-cgroup -E 1 -G "${cgroup}" -q -- perf bench sched messaging -p > /dev/null 2> ${result} if [ "$(cat "${result}" | wc -l)" != "1" ]; then echo "[Fail] BPF result should have a result with cgroup filter:" "$(cat "${cgroup}")" err=1 @@ -338,4 +341,5 @@ test_aggr_task_stack_filter test_cgroup_filter test_csv_output +cleanup exit ${err} diff --git a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h index f291ab4f94eb..3741ea1b73d8 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fcntl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fcntl.h @@ -111,6 +111,7 @@ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ #define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h index 0bd678a4a10e..beb4c2d1e41c 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fs.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h @@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t; /* buffered IO that drops the cache after reading or writing data */ #define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) +/* prevent pipe and socket writes from raising SIGPIPE */ +#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ - RWF_DONTCACHE) + RWF_DONTCACHE | RWF_NOSIGNAL) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h index ed3aed264aeb..51c4e8c82b1e 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h @@ -177,7 +177,17 @@ struct prctl_mm_map { #define PR_GET_TID_ADDRESS 40 +/* + * Flags for PR_SET_THP_DISABLE are only applicable when disabling. Bit 0 + * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively + * return "1" when no flags were specified for PR_SET_THP_DISABLE. + */ #define PR_SET_THP_DISABLE 41 +/* + * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / + * VM_HUGEPAGE, MADV_COLLAPSE). + */ +# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 /* diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4f2a6e10ed5c..4e12be579140 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1022,12 +1022,9 @@ static int write_bpf_prog_info(struct feat_fd *ff, down_read(&env->bpf_progs.lock); - if (env->bpf_progs.infos_cnt == 0) - goto out; - ret = do_write(ff, &env->bpf_progs.infos_cnt, sizeof(env->bpf_progs.infos_cnt)); - if (ret < 0) + if (ret < 0 || env->bpf_progs.infos_cnt == 0) goto out; root = &env->bpf_progs.infos; @@ -1067,13 +1064,10 @@ static int write_bpf_btf(struct feat_fd *ff, down_read(&env->bpf_progs.lock); - if (env->bpf_progs.btfs_cnt == 0) - goto out; - ret = do_write(ff, &env->bpf_progs.btfs_cnt, sizeof(env->bpf_progs.btfs_cnt)); - if (ret < 0) + if (ret < 0 || env->bpf_progs.btfs_cnt == 0) goto out; root = &env->bpf_progs.btfs; diff --git a/tools/perf/util/libbfd.c b/tools/perf/util/libbfd.c index 01147fbf73b3..6434c2dccd4a 100644 --- a/tools/perf/util/libbfd.c +++ b/tools/perf/util/libbfd.c @@ -38,6 +38,39 @@ struct a2l_data { asymbol **syms; }; +static bool perf_bfd_lock(void *bfd_mutex) +{ + mutex_lock(bfd_mutex); + return true; +} + +static bool perf_bfd_unlock(void *bfd_mutex) +{ + mutex_unlock(bfd_mutex); + return true; +} + +static void perf_bfd_init(void) +{ + static struct mutex bfd_mutex; + + mutex_init_recursive(&bfd_mutex); + + if (bfd_init() != BFD_INIT_MAGIC) { + pr_err("Error initializing libbfd\n"); + return; + } + if (!bfd_thread_init(perf_bfd_lock, perf_bfd_unlock, &bfd_mutex)) + pr_err("Error initializing libbfd threading\n"); +} + +static void ensure_bfd_init(void) +{ + static pthread_once_t bfd_init_once = PTHREAD_ONCE_INIT; + + pthread_once(&bfd_init_once, perf_bfd_init); +} + static int bfd_error(const char *string) { const char *errmsg; @@ -132,6 +165,7 @@ static struct a2l_data *addr2line_init(const char *path) bfd *abfd; struct a2l_data *a2l = NULL; + ensure_bfd_init(); abfd = bfd_openr(path, NULL); if (abfd == NULL) return NULL; @@ -288,6 +322,7 @@ int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) bfd *abfd; u64 start, len; + ensure_bfd_init(); abfd = bfd_openr(debugfile, NULL); if (!abfd) return -1; @@ -393,6 +428,7 @@ int libbfd__read_build_id(const char *filename, struct build_id *bid, bool block if (fd < 0) return -1; + ensure_bfd_init(); abfd = bfd_fdopenr(filename, /*target=*/NULL, fd); if (!abfd) return -1; @@ -421,6 +457,7 @@ int libbfd_filename__read_debuglink(const char *filename, char *debuglink, asection *section; bfd *abfd; + ensure_bfd_init(); abfd = bfd_openr(filename, NULL); if (!abfd) return -1; @@ -480,6 +517,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused, memset(tpath, 0, sizeof(tpath)); perf_exe(tpath, sizeof(tpath)); + ensure_bfd_init(); bfdf = bfd_openr(tpath, NULL); if (bfdf == NULL) abort(); diff --git a/tools/perf/util/mutex.c b/tools/perf/util/mutex.c index bca7f0717f35..7aa1f3f55a7d 100644 --- a/tools/perf/util/mutex.c +++ b/tools/perf/util/mutex.c @@ -17,7 +17,7 @@ static void check_err(const char *fn, int err) #define CHECK_ERR(err) check_err(__func__, err) -static void __mutex_init(struct mutex *mtx, bool pshared) +static void __mutex_init(struct mutex *mtx, bool pshared, bool recursive) { pthread_mutexattr_t attr; @@ -27,21 +27,27 @@ static void __mutex_init(struct mutex *mtx, bool pshared) /* In normal builds enable error checking, such as recursive usage. */ CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK)); #endif + if (recursive) + CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)); if (pshared) CHECK_ERR(pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)); - CHECK_ERR(pthread_mutex_init(&mtx->lock, &attr)); CHECK_ERR(pthread_mutexattr_destroy(&attr)); } void mutex_init(struct mutex *mtx) { - __mutex_init(mtx, /*pshared=*/false); + __mutex_init(mtx, /*pshared=*/false, /*recursive=*/false); } void mutex_init_pshared(struct mutex *mtx) { - __mutex_init(mtx, /*pshared=*/true); + __mutex_init(mtx, /*pshared=*/true, /*recursive=*/false); +} + +void mutex_init_recursive(struct mutex *mtx) +{ + __mutex_init(mtx, /*pshared=*/false, /*recursive=*/true); } void mutex_destroy(struct mutex *mtx) diff --git a/tools/perf/util/mutex.h b/tools/perf/util/mutex.h index 38458f00846f..70232d8d094f 100644 --- a/tools/perf/util/mutex.h +++ b/tools/perf/util/mutex.h @@ -104,6 +104,8 @@ void mutex_init(struct mutex *mtx); * process-private attribute. */ void mutex_init_pshared(struct mutex *mtx); +/* Initializes a mutex that may be recursively held on the same thread. */ +void mutex_init_recursive(struct mutex *mtx); void mutex_destroy(struct mutex *mtx); void mutex_lock(struct mutex *mtx) EXCLUSIVE_LOCK_FUNCTION(*mtx); diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index cc26b7bf302b..948d3e8ad782 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -112,9 +112,13 @@ static bool symbol_type__filter(char __symbol_type) // 'N' first seen in: // ffffffff9b35d130 N __pfx__RNCINvNtNtNtCsbDUBuN8AbD4_4core4iter8adapters3map12map_try_foldjNtCs6vVzKs5jPr6_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_ // a seemingly Rust mangled name + // Ditto for '1': + // root@x1:~# grep ' 1 ' /proc/kallsyms + // ffffffffb098bc00 1 __pfx__RNCINvNtNtNtCsfwaGRd4cjqE_4core4iter8adapters3map12map_try_foldjNtCskFudTml27HW_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_ + // ffffffffb098bc10 1 _RNCINvNtNtNtCsfwaGRd4cjqE_4core4iter8adapters3map12map_try_foldjNtCskFudTml27HW_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_ char symbol_type = toupper(__symbol_type); return symbol_type == 'T' || symbol_type == 'W' || symbol_type == 'D' || symbol_type == 'B' || - __symbol_type == 'u' || __symbol_type == 'l' || __symbol_type == 'N'; + __symbol_type == 'u' || __symbol_type == 'l' || __symbol_type == 'N' || __symbol_type == '1'; } static int prefix_underscores_count(const char *str) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 70b28c1e653e..f2a2fd236ca8 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -50,6 +50,7 @@ CONFIG_IPV6_SIT=y CONFIG_IPV6_TUNNEL=y CONFIG_KEYS=y CONFIG_LIRC=y +CONFIG_LIVEPATCH=y CONFIG_LWTUNNEL=y CONFIG_MODULE_SIG=y CONFIG_MODULE_SRCVERSION_ALL=y @@ -111,6 +112,8 @@ CONFIG_IP6_NF_FILTER=y CONFIG_NF_NAT=y CONFIG_PACKET=y CONFIG_RC_CORE=y +CONFIG_SAMPLES=y +CONFIG_SAMPLE_LIVEPATCH=m CONFIG_SECURITY=y CONFIG_SECURITYFS=y CONFIG_SYN_COOKIES=y diff --git a/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c new file mode 100644 index 000000000000..72aa5376c30e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <test_progs.h> +#include "testing_helpers.h" +#include "livepatch_trampoline.skel.h" + +static int load_livepatch(void) +{ + char path[4096]; + + /* CI will set KBUILD_OUTPUT */ + snprintf(path, sizeof(path), "%s/samples/livepatch/livepatch-sample.ko", + getenv("KBUILD_OUTPUT") ? : "../../../.."); + + return load_module(path, env_verbosity > VERBOSE_NONE); +} + +static void unload_livepatch(void) +{ + /* Disable the livepatch before unloading the module */ + system("echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled"); + + unload_module("livepatch_sample", env_verbosity > VERBOSE_NONE); +} + +static void read_proc_cmdline(void) +{ + char buf[4096]; + int fd, ret; + + fd = open("/proc/cmdline", O_RDONLY); + if (!ASSERT_OK_FD(fd, "open /proc/cmdline")) + return; + + ret = read(fd, buf, sizeof(buf)); + if (!ASSERT_GT(ret, 0, "read /proc/cmdline")) + goto out; + + ASSERT_OK(strncmp(buf, "this has been live patched", 26), "strncmp"); + +out: + close(fd); +} + +static void __test_livepatch_trampoline(bool fexit_first) +{ + struct livepatch_trampoline *skel = NULL; + int err; + + skel = livepatch_trampoline__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + goto out; + + skel->bss->my_pid = getpid(); + + if (!fexit_first) { + /* fentry program is loaded first by default */ + err = livepatch_trampoline__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto out; + } else { + /* Manually load fexit program first. */ + skel->links.fexit_cmdline = bpf_program__attach(skel->progs.fexit_cmdline); + if (!ASSERT_OK_PTR(skel->links.fexit_cmdline, "attach_fexit")) + goto out; + + skel->links.fentry_cmdline = bpf_program__attach(skel->progs.fentry_cmdline); + if (!ASSERT_OK_PTR(skel->links.fentry_cmdline, "attach_fentry")) + goto out; + } + + read_proc_cmdline(); + + ASSERT_EQ(skel->bss->fentry_hit, 1, "fentry_hit"); + ASSERT_EQ(skel->bss->fexit_hit, 1, "fexit_hit"); +out: + livepatch_trampoline__destroy(skel); +} + +void test_livepatch_trampoline(void) +{ + int retry_cnt = 0; + +retry: + if (load_livepatch()) { + if (retry_cnt) { + ASSERT_OK(1, "load_livepatch"); + goto out; + } + /* + * Something else (previous run of the same test?) loaded + * the KLP module. Unload the KLP module and retry. + */ + unload_livepatch(); + retry_cnt++; + goto retry; + } + + if (test__start_subtest("fentry_first")) + __test_livepatch_trampoline(false); + + if (test__start_subtest("fexit_first")) + __test_livepatch_trampoline(true); +out: + unload_livepatch(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c index f8eb7f9d4fd2..8fade8bdc451 100644 --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c @@ -6,11 +6,13 @@ #include <netinet/in.h> #include <test_progs.h> #include <unistd.h> +#include <errno.h> #include "cgroup_helpers.h" #include "network_helpers.h" #include "mptcp_sock.skel.h" #include "mptcpify.skel.h" #include "mptcp_subflow.skel.h" +#include "mptcp_sockmap.skel.h" #define NS_TEST "mptcp_ns" #define ADDR_1 "10.0.1.1" @@ -436,6 +438,142 @@ close_cgroup: close(cgroup_fd); } +/* Test sockmap on MPTCP server handling non-mp-capable clients. */ +static void test_sockmap_with_mptcp_fallback(struct mptcp_sockmap *skel) +{ + int listen_fd = -1, client_fd1 = -1, client_fd2 = -1; + int server_fd1 = -1, server_fd2 = -1, sent, recvd; + char snd[9] = "123456789"; + char rcv[10]; + + /* start server with MPTCP enabled */ + listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0); + if (!ASSERT_OK_FD(listen_fd, "sockmap-fb:start_mptcp_server")) + return; + + skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd)); + skel->bss->sk_index = 0; + /* create client without MPTCP enabled */ + client_fd1 = connect_to_fd_opts(listen_fd, NULL); + if (!ASSERT_OK_FD(client_fd1, "sockmap-fb:connect_to_fd")) + goto end; + + server_fd1 = accept(listen_fd, NULL, 0); + skel->bss->sk_index = 1; + client_fd2 = connect_to_fd_opts(listen_fd, NULL); + if (!ASSERT_OK_FD(client_fd2, "sockmap-fb:connect_to_fd")) + goto end; + + server_fd2 = accept(listen_fd, NULL, 0); + /* test normal redirect behavior: data sent by client_fd1 can be + * received by client_fd2 + */ + skel->bss->redirect_idx = 1; + sent = send(client_fd1, snd, sizeof(snd), 0); + if (!ASSERT_EQ(sent, sizeof(snd), "sockmap-fb:send(client_fd1)")) + goto end; + + /* try to recv more bytes to avoid truncation check */ + recvd = recv(client_fd2, rcv, sizeof(rcv), 0); + if (!ASSERT_EQ(recvd, sizeof(snd), "sockmap-fb:recv(client_fd2)")) + goto end; + +end: + if (client_fd1 >= 0) + close(client_fd1); + if (client_fd2 >= 0) + close(client_fd2); + if (server_fd1 >= 0) + close(server_fd1); + if (server_fd2 >= 0) + close(server_fd2); + close(listen_fd); +} + +/* Test sockmap rejection of MPTCP sockets - both server and client sides. */ +static void test_sockmap_reject_mptcp(struct mptcp_sockmap *skel) +{ + int listen_fd = -1, server_fd = -1, client_fd1 = -1; + int err, zero = 0; + + /* start server with MPTCP enabled */ + listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0); + if (!ASSERT_OK_FD(listen_fd, "start_mptcp_server")) + return; + + skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd)); + skel->bss->sk_index = 0; + /* create client with MPTCP enabled */ + client_fd1 = connect_to_fd(listen_fd, 0); + if (!ASSERT_OK_FD(client_fd1, "connect_to_fd client_fd1")) + goto end; + + /* bpf_sock_map_update() called from sockops should reject MPTCP sk */ + if (!ASSERT_EQ(skel->bss->helper_ret, -EOPNOTSUPP, "should reject")) + goto end; + + server_fd = accept(listen_fd, NULL, 0); + err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map), + &zero, &server_fd, BPF_NOEXIST); + if (!ASSERT_EQ(err, -EOPNOTSUPP, "server should be disallowed")) + goto end; + + /* MPTCP client should also be disallowed */ + err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map), + &zero, &client_fd1, BPF_NOEXIST); + if (!ASSERT_EQ(err, -EOPNOTSUPP, "client should be disallowed")) + goto end; +end: + if (client_fd1 >= 0) + close(client_fd1); + if (server_fd >= 0) + close(server_fd); + close(listen_fd); +} + +static void test_mptcp_sockmap(void) +{ + struct mptcp_sockmap *skel; + struct netns_obj *netns; + int cgroup_fd, err; + + cgroup_fd = test__join_cgroup("/mptcp_sockmap"); + if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup: mptcp_sockmap")) + return; + + skel = mptcp_sockmap__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_load: mptcp_sockmap")) + goto close_cgroup; + + skel->links.mptcp_sockmap_inject = + bpf_program__attach_cgroup(skel->progs.mptcp_sockmap_inject, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links.mptcp_sockmap_inject, "attach sockmap")) + goto skel_destroy; + + err = bpf_prog_attach(bpf_program__fd(skel->progs.mptcp_sockmap_redirect), + bpf_map__fd(skel->maps.sock_map), + BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach stream verdict")) + goto skel_destroy; + + netns = netns_new(NS_TEST, true); + if (!ASSERT_OK_PTR(netns, "netns_new: mptcp_sockmap")) + goto skel_destroy; + + if (endpoint_init("subflow") < 0) + goto close_netns; + + test_sockmap_with_mptcp_fallback(skel); + test_sockmap_reject_mptcp(skel); + +close_netns: + netns_free(netns); +skel_destroy: + mptcp_sockmap__destroy(skel); +close_cgroup: + close(cgroup_fd); +} + void test_mptcp(void) { if (test__start_subtest("base")) @@ -444,4 +582,6 @@ void test_mptcp(void) test_mptcpify(); if (test__start_subtest("subflow")) test_subflow(); + if (test__start_subtest("sockmap")) + test_mptcp_sockmap(); } diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c new file mode 100644 index 000000000000..c9efdd2a5b18 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include "stacktrace_ips.skel.h" + +#ifdef __x86_64__ +static int check_stacktrace_ips(int fd, __u32 key, int cnt, ...) +{ + __u64 ips[PERF_MAX_STACK_DEPTH]; + struct ksyms *ksyms = NULL; + int i, err = 0; + va_list args; + + /* sorted by addr */ + ksyms = load_kallsyms_local(); + if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_local")) + return -1; + + /* unlikely, but... */ + if (!ASSERT_LT(cnt, PERF_MAX_STACK_DEPTH, "check_max")) + return -1; + + err = bpf_map_lookup_elem(fd, &key, ips); + if (err) + goto out; + + /* + * Compare all symbols provided via arguments with stacktrace ips, + * and their related symbol addresses.t + */ + va_start(args, cnt); + + for (i = 0; i < cnt; i++) { + unsigned long val; + struct ksym *ksym; + + val = va_arg(args, unsigned long); + ksym = ksym_search_local(ksyms, ips[i]); + if (!ASSERT_OK_PTR(ksym, "ksym_search_local")) + break; + ASSERT_EQ(ksym->addr, val, "stack_cmp"); + } + + va_end(args); + +out: + free_kallsyms_local(ksyms); + return err; +} + +static void test_stacktrace_ips_kprobe_multi(bool retprobe) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts, + .retprobe = retprobe + ); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.kprobe_multi_test = bpf_program__attach_kprobe_multi_opts( + skel->progs.kprobe_multi_test, + "bpf_testmod_stacktrace_test", &opts); + if (!ASSERT_OK_PTR(skel->links.kprobe_multi_test, "bpf_program__attach_kprobe_multi_opts")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + +cleanup: + stacktrace_ips__destroy(skel); +} + +static void test_stacktrace_ips_raw_tp(void) +{ + __u32 info_len = sizeof(struct bpf_prog_info); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct bpf_prog_info info = {}; + struct stacktrace_ips *skel; + __u64 bpf_prog_ksym = 0; + int err; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.rawtp_test = bpf_program__attach_raw_tracepoint( + skel->progs.rawtp_test, + "bpf_testmod_test_read"); + if (!ASSERT_OK_PTR(skel->links.rawtp_test, "bpf_program__attach_raw_tracepoint")) + goto cleanup; + + /* get bpf program address */ + info.jited_ksyms = ptr_to_u64(&bpf_prog_ksym); + info.nr_jited_ksyms = 1; + err = bpf_prog_get_info_by_fd(bpf_program__fd(skel->progs.rawtp_test), + &info, &info_len); + if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 2, + bpf_prog_ksym, + ksym_get_addr("bpf_trace_run2")); + +cleanup: + stacktrace_ips__destroy(skel); +} + +static void __test_stacktrace_ips(void) +{ + if (test__start_subtest("kprobe_multi")) + test_stacktrace_ips_kprobe_multi(false); + if (test__start_subtest("kretprobe_multi")) + test_stacktrace_ips_kprobe_multi(true); + if (test__start_subtest("raw_tp")) + test_stacktrace_ips_raw_tp(); +} +#else +static void __test_stacktrace_ips(void) +{ + test__skip(); +} +#endif + +void test_stacktrace_ips(void) +{ + __test_stacktrace_ips(); +} diff --git a/tools/testing/selftests/bpf/progs/iters_looping.c b/tools/testing/selftests/bpf/progs/iters_looping.c index 05fa5ce7fc59..d00fd570255a 100644 --- a/tools/testing/selftests/bpf/progs/iters_looping.c +++ b/tools/testing/selftests/bpf/progs/iters_looping.c @@ -161,3 +161,56 @@ int simplest_loop(void *ctx) return 0; } + +__used +static void iterator_with_diff_stack_depth(int x) +{ + struct bpf_iter_num iter; + + asm volatile ( + "if r1 == 42 goto 0f;" + "*(u64 *)(r10 - 128) = 0;" + "0:" + /* create iterator */ + "r1 = %[iter];" + "r2 = 0;" + "r3 = 10;" + "call %[bpf_iter_num_new];" + "1:" + /* consume next item */ + "r1 = %[iter];" + "call %[bpf_iter_num_next];" + "if r0 == 0 goto 2f;" + "goto 1b;" + "2:" + /* destroy iterator */ + "r1 = %[iter];" + "call %[bpf_iter_num_destroy];" + : + : __imm_ptr(iter), ITER_HELPERS + : __clobber_common, "r6" + ); +} + +SEC("socket") +__success +__naked int widening_stack_size_bug(void *ctx) +{ + /* + * Depending on iterator_with_diff_stack_depth() parameter value, + * subprogram stack depth is either 8 or 128 bytes. Arrange values so + * that it is 128 on a first call and 8 on a second. This triggered a + * bug in verifier's widen_imprecise_scalars() logic. + */ + asm volatile ( + "r6 = 0;" + "r1 = 0;" + "1:" + "call iterator_with_diff_stack_depth;" + "r1 = 42;" + "r6 += 1;" + "if r6 < 2 goto 1b;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} diff --git a/tools/testing/selftests/bpf/progs/livepatch_trampoline.c b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c new file mode 100644 index 000000000000..15579d5bcd91 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +int fentry_hit; +int fexit_hit; +int my_pid; + +SEC("fentry/cmdline_proc_show") +int BPF_PROG(fentry_cmdline) +{ + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + fentry_hit = 1; + return 0; +} + +SEC("fexit/cmdline_proc_show") +int BPF_PROG(fexit_cmdline) +{ + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + fexit_hit = 1; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/mptcp_sockmap.c b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c new file mode 100644 index 000000000000..d4eef0cbadb9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bpf_tracing_net.h" + +char _license[] SEC("license") = "GPL"; + +int sk_index; +int redirect_idx; +int trace_port; +int helper_ret; +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); + __uint(max_entries, 100); +} sock_map SEC(".maps"); + +SEC("sockops") +int mptcp_sockmap_inject(struct bpf_sock_ops *skops) +{ + struct bpf_sock *sk; + + /* only accept specified connection */ + if (skops->local_port != trace_port || + skops->op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) + return 1; + + sk = skops->sk; + if (!sk) + return 1; + + /* update sk handler */ + helper_ret = bpf_sock_map_update(skops, &sock_map, &sk_index, BPF_NOEXIST); + + return 1; +} + +SEC("sk_skb/stream_verdict") +int mptcp_sockmap_redirect(struct __sk_buff *skb) +{ + /* redirect skb to the sk under sock_map[redirect_idx] */ + return bpf_sk_redirect_map(skb, &sock_map, redirect_idx, 0); +} diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c new file mode 100644 index 000000000000..a96c8150d7f5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#ifndef PERF_MAX_STACK_DEPTH +#define PERF_MAX_STACK_DEPTH 127 +#endif + +typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH]; + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(max_entries, 16384); + __type(key, __u32); + __type(value, stack_trace_t); +} stackmap SEC(".maps"); + +extern bool CONFIG_UNWINDER_ORC __kconfig __weak; + +/* + * This function is here to have CONFIG_UNWINDER_ORC + * used and added to object BTF. + */ +int unused(void) +{ + return CONFIG_UNWINDER_ORC ? 0 : 1; +} + +__u32 stack_key; + +SEC("kprobe.multi") +int kprobe_multi_test(struct pt_regs *ctx) +{ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + +SEC("raw_tp/bpf_testmod_test_read") +int rawtp_test(void *ctx) +{ + /* Skip ebpf program entry in the stack. */ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index b4a0d0cc8ec8..3662515f0107 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -10,7 +10,7 @@ SEC("syscall") __failure __msg("Possibly NULL pointer passed") int stream_vprintk_null_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL); return 0; } @@ -18,7 +18,7 @@ SEC("syscall") __failure __msg("R3 type=scalar expected=") int stream_vprintk_scalar_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL); return 0; } @@ -26,7 +26,7 @@ SEC("syscall") __failure __msg("arg#1 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { - bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0, NULL); + bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c index 23217f06a3ec..663a80990f8f 100644 --- a/tools/testing/selftests/bpf/progs/task_work.c +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -66,7 +66,7 @@ int oncpu_hash_map(struct pt_regs *args) if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); return 0; } @@ -80,7 +80,7 @@ int oncpu_array_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL); + bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL); return 0; } @@ -102,6 +102,6 @@ int oncpu_lru_map(struct pt_regs *args) work = bpf_map_lookup_elem(&lrumap, &key); if (!work || work->data[0]) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 77fe8f28facd..1270953fd092 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); return 0; } @@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args) struct bpf_task_work tw; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL); return 0; } @@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args) struct task_struct *task; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL); return 0; } @@ -91,6 +91,6 @@ int map_null(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL); + bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c index 90fca06fff56..55e555f7f41b 100644 --- a/tools/testing/selftests/bpf/progs/task_work_stress.c +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -51,8 +51,8 @@ int schedule_task_work(void *ctx) if (!work) return 0; } - err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, - process_work, NULL); + err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work, NULL); if (err) __sync_fetch_and_add(&schedule_error, 1); else diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 8074bc5f6f20..ed0a4721d8fd 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -417,6 +417,30 @@ noinline int bpf_testmod_fentry_test11(u64 a, void *b, short c, int d, return a + (long)b + c + d + (long)e + f + g + h + i + j + k; } +noinline void bpf_testmod_stacktrace_test(void) +{ + /* used for stacktrace test as attach function */ + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_3(void) +{ + bpf_testmod_stacktrace_test(); + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_2(void) +{ + bpf_testmod_stacktrace_test_3(); + asm volatile (""); +} + +noinline void bpf_testmod_stacktrace_test_1(void) +{ + bpf_testmod_stacktrace_test_2(); + asm volatile (""); +} + int bpf_testmod_fentry_ok; noinline ssize_t @@ -497,6 +521,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj, 21, 22, 23, 24, 25, 26) != 231) goto out; + bpf_testmod_stacktrace_test_1(); + bpf_testmod_fentry_ok = 1; out: return -EIO; /* always fail */ diff --git a/tools/testing/selftests/coredump/.gitignore b/tools/testing/selftests/coredump/.gitignore new file mode 100644 index 000000000000..097f52db0be9 --- /dev/null +++ b/tools/testing/selftests/coredump/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +stackdump_test +coredump_socket_test +coredump_socket_protocol_test diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile index 77b3665c73c7..dece1a31d561 100644 --- a/tools/testing/selftests/coredump/Makefile +++ b/tools/testing/selftests/coredump/Makefile @@ -1,7 +1,13 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -TEST_GEN_PROGS := stackdump_test +TEST_GEN_PROGS := stackdump_test \ + coredump_socket_test \ + coredump_socket_protocol_test TEST_FILES := stackdump include ../lib.mk + +$(OUTPUT)/stackdump_test: coredump_test_helpers.c +$(OUTPUT)/coredump_socket_test: coredump_test_helpers.c +$(OUTPUT)/coredump_socket_protocol_test: coredump_test_helpers.c diff --git a/tools/testing/selftests/coredump/coredump_socket_protocol_test.c b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c new file mode 100644 index 000000000000..d19b6717c53e --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c @@ -0,0 +1,1568 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/stat.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "coredump_test.h" + +#define NUM_CRASHING_COREDUMPS 5 + +FIXTURE_SETUP(coredump) +{ + FILE *file; + int ret; + + self->pid_coredump_server = -ESRCH; + self->fd_tmpfs_detached = -1; + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + self->fd_tmpfs_detached = create_detached_tmpfs(); + ASSERT_GE(self->fd_tmpfs_detached, 0); + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret, status; + + if (self->pid_coredump_server > 0) { + kill(self->pid_coredump_server, SIGTERM); + waitpid(self->pid_coredump_server, &status, 0); + } + unlink("/tmp/coredump.file"); + unlink("/tmp/coredump.socket"); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + if (self->fd_tmpfs_detached >= 0) { + ret = close(self->fd_tmpfs_detached); + if (ret < 0) { + reason = "Unable to close detached tmpfs"; + goto fail; + } + self->fd_tmpfs_detached = -1; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason); +} + +TEST_F(coredump, socket_request_kernel) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_kernel: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_kernel: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_kernel: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_kernel: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_kernel: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_kernel: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_kernel: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + fd_core_file = creat("/tmp/coredump.file", 0644); + if (fd_core_file < 0) { + fprintf(stderr, "socket_request_kernel: creat coredump file failed: %m\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_kernel: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_kernel: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_kernel: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_request_kernel: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_request_kernel: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "socket_request_kernel: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_kernel: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); + ASSERT_GT(st.st_size, 0); + system("file /tmp/coredump.file"); +} + +TEST_F(coredump, socket_request_userspace) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_userspace: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_userspace: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_userspace: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_userspace: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_userspace: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_userspace: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_userspace: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_userspace: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_userspace: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_userspace: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_request_userspace: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read > 0) { + fprintf(stderr, "socket_request_userspace: unexpected data received (expected no coredump data)\n"); + goto out; + } + + if (bytes_read < 0) { + fprintf(stderr, "socket_request_userspace: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_userspace: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_reject) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_reject: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_reject: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_reject: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_reject: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_reject: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_reject: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_reject: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_reject: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_reject: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_reject: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_request_reject: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read > 0) { + fprintf(stderr, "socket_request_reject: unexpected data received (expected no coredump data for REJECT)\n"); + goto out; + } + + if (bytes_read < 0) { + fprintf(stderr, "socket_request_reject: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_reject: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_flag_combination) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_flag_combination: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_flag_combination: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_flag_combination: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_flag_combination: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_request_invalid_flag_combination: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) { + fprintf(stderr, "socket_request_invalid_flag_combination: read_marker COREDUMP_MARK_CONFLICTING failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_flag_combination: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_unknown_flag) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_unknown_flag: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_unknown_flag: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_unknown_flag: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_unknown_flag: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_unknown_flag: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_unknown_flag: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_unknown_flag: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_unknown_flag: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_unknown_flag: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) { + fprintf(stderr, "socket_request_unknown_flag: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) { + fprintf(stderr, "socket_request_unknown_flag: read_marker COREDUMP_MARK_UNSUPPORTED failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_unknown_flag: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_size_small) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_size_small: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_size_small: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_size_small: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_size_small: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_size_small: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_size_small: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_size_small: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_size_small: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_size_small: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, + COREDUMP_ACK_SIZE_VER0 / 2)) { + fprintf(stderr, "socket_request_invalid_size_small: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) { + fprintf(stderr, "socket_request_invalid_size_small: read_marker COREDUMP_MARK_MINSIZE failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_size_small: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_request_invalid_size_large) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_request_invalid_size_large: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_request_invalid_size_large: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_request_invalid_size_large: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_request_invalid_size_large: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_request_invalid_size_large: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_request_invalid_size_large: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_request_invalid_size_large: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_request_invalid_size_large: read_coredump_req failed\n"); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_request_invalid_size_large: check_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, + COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) { + fprintf(stderr, "socket_request_invalid_size_large: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) { + fprintf(stderr, "socket_request_invalid_size_large: read_marker COREDUMP_MARK_MAXSIZE failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_request_invalid_size_large: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGSEGV + * + * Verify that when using socket-based coredump protocol, + * the coredump_signal field is correctly exposed as SIGSEGV. + */ +TEST_F(coredump, socket_coredump_signal_sigsegv) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGSEGV) { + fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n", + info.coredump_signal, SIGSEGV); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGSEGV); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGSEGV); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGABRT + * + * Verify that when using socket-based coredump protocol, + * the coredump_signal field is correctly exposed as SIGABRT. + */ +TEST_F(coredump, socket_coredump_signal_sigabrt) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + struct coredump_req req = {}; + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGABRT) { + fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n", + info.coredump_signal, SIGABRT); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read_coredump_req failed\n"); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_REJECT | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: send_coredump_ack failed\n"); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read_marker COREDUMP_MARK_REQACK failed\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + abort(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGABRT); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGABRT); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500) +{ + int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; + pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + struct coredump_req req = {}; + + close(ipc_sockets[0]); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "Failed to create and listen on unix socket\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "Failed to notify parent via ipc socket\n"); + goto out; + } + close(ipc_sockets[1]); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd); + goto out; + } + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd); + goto out; + } + + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump); + goto out; + } + + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump); + goto out; + } + + if (!send_coredump_ack(fd_coredump, &req, + COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump); + goto out; + } + + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file); + goto out; + } + } + + close(fd_core_file); + close(fd_peer_pidfd); + close(fd_coredump); + fd_peer_pidfd = -1; + fd_coredump = -1; + } + + exit_code = EXIT_SUCCESS; +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + pid[i] = fork(); + ASSERT_GE(pid[i], 0); + if (pid[i] == 0) + crashing_child(); + pidfd[i] = sys_pidfd_open(pid[i], 0); + ASSERT_GE(pidfd[i], 0); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + waitpid(pid[i], &status[i], 0); + ASSERT_TRUE(WIFSIGNALED(status[i])); + ASSERT_TRUE(WCOREDUMP(status[i])); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + } + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500) +{ + int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; + pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS]; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0; + fd_server = -1; + exit_code = EXIT_FAILURE; + n_conns = 0; + close(ipc_sockets[0]); + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: write_nointr to ipc socket failed: %m\n"); + goto out; + } + close(ipc_sockets[1]); + + while (n_conns < NUM_CRASHING_COREDUMPS) { + int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + struct coredump_req req = {}; + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + continue; + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: accept4 failed: %m\n"); + goto out; + } + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_peer_pidfd failed\n"); + goto out; + } + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_pidfd_info failed\n"); + goto out; + } + if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: missing PIDFD_INFO_COREDUMP or PIDFD_COREDUMPED\n"); + goto out; + } + if (!read_coredump_req(fd_coredump, &req)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_coredump_req failed\n"); + goto out; + } + if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, + COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: check_coredump_req failed\n"); + goto out; + } + if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: send_coredump_ack failed\n"); + goto out; + } + if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_marker failed\n"); + goto out; + } + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: open_coredump_tmpfile failed: %m\n"); + goto out; + } + pid_t worker = fork(); + if (worker == 0) { + close(fd_server); + process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file); + } + worker_pids[n_conns] = worker; + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_core_file >= 0) + close(fd_core_file); + n_conns++; + } + exit_code = EXIT_SUCCESS; +out: + if (fd_server >= 0) + close(fd_server); + + // Reap all worker processes + for (int i = 0; i < n_conns; i++) { + int wstatus; + if (waitpid(worker_pids[i], &wstatus, 0) < 0) { + fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]); + } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) { + fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus)); + exit_code = EXIT_FAILURE; + } + } + + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + pid[i] = fork(); + ASSERT_GE(pid[i], 0); + if (pid[i] == 0) + crashing_child(); + pidfd[i] = sys_pidfd_open(pid[i], 0); + ASSERT_GE(pidfd[i], 0); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + ASSERT_GE(waitpid(pid[i], &status[i], 0), 0); + ASSERT_TRUE(WIFSIGNALED(status[i])); + ASSERT_TRUE(WCOREDUMP(status[i])); + } + + for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { + info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; + ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + } + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/coredump/coredump_socket_test.c b/tools/testing/selftests/coredump/coredump_socket_test.c new file mode 100644 index 000000000000..7e26d4a6a15d --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_socket_test.c @@ -0,0 +1,742 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/stat.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "coredump_test.h" + +FIXTURE_SETUP(coredump) +{ + FILE *file; + int ret; + + self->pid_coredump_server = -ESRCH; + self->fd_tmpfs_detached = -1; + file = fopen("/proc/sys/kernel/core_pattern", "r"); + ASSERT_NE(NULL, file); + + ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file); + ASSERT_TRUE(ret || feof(file)); + ASSERT_LT(ret, sizeof(self->original_core_pattern)); + + self->original_core_pattern[ret] = '\0'; + self->fd_tmpfs_detached = create_detached_tmpfs(); + ASSERT_GE(self->fd_tmpfs_detached, 0); + + ret = fclose(file); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(coredump) +{ + const char *reason; + FILE *file; + int ret, status; + + if (self->pid_coredump_server > 0) { + kill(self->pid_coredump_server, SIGTERM); + waitpid(self->pid_coredump_server, &status, 0); + } + unlink("/tmp/coredump.file"); + unlink("/tmp/coredump.socket"); + + file = fopen("/proc/sys/kernel/core_pattern", "w"); + if (!file) { + reason = "Unable to open core_pattern"; + goto fail; + } + + ret = fprintf(file, "%s", self->original_core_pattern); + if (ret < 0) { + reason = "Unable to write to core_pattern"; + goto fail; + } + + ret = fclose(file); + if (ret) { + reason = "Unable to close core_pattern"; + goto fail; + } + + if (self->fd_tmpfs_detached >= 0) { + ret = close(self->fd_tmpfs_detached); + if (ret < 0) { + reason = "Unable to close detached tmpfs"; + goto fail; + } + self->fd_tmpfs_detached = -1; + } + + return; +fail: + /* This should never happen */ + fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason); +} + +TEST_F(coredump, socket) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket test: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket test: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket test: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket test: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket test: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket test: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket test: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + fd_core_file = creat("/tmp/coredump.file", 0644); + if (fd_core_file < 0) { + fprintf(stderr, "socket test: creat coredump file failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket test: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "socket test: write to core file failed (read=%zd, write=%zd): %m\n", bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket test: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); + ASSERT_GT(st.st_size, 0); +} + +TEST_F(coredump, socket_detect_userspace_client) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct stat st; + struct pidfd_info info = { + .mask = PIDFD_INFO_COREDUMP, + }; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_detect_userspace_client: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_detect_userspace_client: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_detect_userspace_client: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_detect_userspace_client: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_detect_userspace_client: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_detect_userspace_client: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (info.coredump_mask & PIDFD_COREDUMPED) { + fprintf(stderr, "socket_detect_userspace_client: PIDFD_COREDUMPED incorrectly set (should be userspace client)\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_detect_userspace_client: completed successfully\n"); +out: + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) { + int fd_socket; + ssize_t ret; + const struct sockaddr_un coredump_sk = { + .sun_family = AF_UNIX, + .sun_path = "/tmp/coredump.socket", + }; + size_t coredump_sk_len = + offsetof(struct sockaddr_un, sun_path) + + sizeof("/tmp/coredump.socket"); + + fd_socket = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd_socket < 0) { + fprintf(stderr, "socket_detect_userspace_client (client): socket failed: %m\n"); + _exit(EXIT_FAILURE); + } + + ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len); + if (ret < 0) { + fprintf(stderr, "socket_detect_userspace_client (client): connect failed: %m\n"); + _exit(EXIT_FAILURE); + } + + close(fd_socket); + pause(); + fprintf(stderr, "socket_detect_userspace_client (client): completed successfully\n"); + _exit(EXIT_SUCCESS); + } + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); + ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); + + ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0); + ASSERT_EQ(close(pidfd), 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + ASSERT_NE(stat("/tmp/coredump.file", &st), 0); + ASSERT_EQ(errno, ENOENT); +} + +TEST_F(coredump, socket_enoent) +{ + int pidfd, status; + pid_t pid; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); +} + +TEST_F(coredump, socket_no_listener) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + int ipc_sockets[2]; + char c; + const struct sockaddr_un coredump_sk = { + .sun_family = AF_UNIX, + .sun_path = "/tmp/coredump.socket", + }; + size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) + + sizeof("/tmp/coredump.socket"); + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (fd_server < 0) { + fprintf(stderr, "socket_no_listener: socket failed: %m\n"); + goto out; + } + + ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); + if (ret < 0) { + fprintf(stderr, "socket_no_listener: bind failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_no_listener: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_no_listener: completed successfully\n"); +out: + if (fd_server >= 0) + close(fd_server); + close(ipc_sockets[1]); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_FALSE(WCOREDUMP(status)); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump + * + * Verify that when using simple socket-based coredump (@ pattern), + * the coredump_signal field is correctly exposed as SIGSEGV. + */ +TEST_F(coredump, socket_coredump_signal_sigsegv) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGSEGV) { + fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n", + info.coredump_signal, SIGSEGV); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: open_coredump_tmpfile failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_coredump_signal_sigsegv: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + fprintf(stderr, "socket_coredump_signal_sigsegv: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + crashing_child(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGSEGV); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGSEGV); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +/* + * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump with SIGABRT + * + * Verify that when using simple socket-based coredump (@ pattern), + * the coredump_signal field is correctly exposed as SIGABRT. + */ +TEST_F(coredump, socket_coredump_signal_sigabrt) +{ + int pidfd, ret, status; + pid_t pid, pid_coredump_server; + struct pidfd_info info = {}; + int ipc_sockets[2]; + char c; + + ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); + + ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_EQ(ret, 0); + + pid_coredump_server = fork(); + ASSERT_GE(pid_coredump_server, 0); + if (pid_coredump_server == 0) { + int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; + int exit_code = EXIT_FAILURE; + + close(ipc_sockets[0]); + + fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); + if (fd_server < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n"); + goto out; + } + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n"); + goto out; + } + + close(ipc_sockets[1]); + + fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); + if (fd_coredump < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n"); + goto out; + } + + fd_peer_pidfd = get_peer_pidfd(fd_coredump); + if (fd_peer_pidfd < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n"); + goto out; + } + + if (!get_pidfd_info(fd_peer_pidfd, &info)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n"); + goto out; + } + + if (!(info.mask & PIDFD_INFO_COREDUMP)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n"); + goto out; + } + + if (!(info.coredump_mask & PIDFD_COREDUMPED)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n"); + goto out; + } + + /* Verify coredump_signal is available and correct */ + if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) { + fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n"); + goto out; + } + + if (info.coredump_signal != SIGABRT) { + fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n", + info.coredump_signal, SIGABRT); + goto out; + } + + fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); + if (fd_core_file < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: open_coredump_tmpfile failed: %m\n"); + goto out; + } + + for (;;) { + char buffer[4096]; + ssize_t bytes_read, bytes_write; + + bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + fprintf(stderr, "socket_coredump_signal_sigabrt: read from coredump socket failed: %m\n"); + goto out; + } + + if (bytes_read == 0) + break; + + bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_read != bytes_write) { + fprintf(stderr, "socket_coredump_signal_sigabrt: write to core file failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + + exit_code = EXIT_SUCCESS; + fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n"); +out: + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + if (fd_server >= 0) + close(fd_server); + _exit(exit_code); + } + self->pid_coredump_server = pid_coredump_server; + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) + abort(); + + pidfd = sys_pidfd_open(pid, 0); + ASSERT_GE(pidfd, 0); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGABRT); + ASSERT_TRUE(WCOREDUMP(status)); + + ASSERT_TRUE(get_pidfd_info(pidfd, &info)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)); + ASSERT_EQ(info.coredump_signal, SIGABRT); + + wait_and_check_coredump_server(pid_coredump_server, _metadata, self); +} + +TEST_F(coredump, socket_invalid_paths) +{ + ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket")); + ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/..")); + ASSERT_FALSE(set_core_pattern("@..")); + + ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@../coredump.socket")); + ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/..")); + ASSERT_FALSE(set_core_pattern("@@..")); + + ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket")); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/coredump/coredump_test.h b/tools/testing/selftests/coredump/coredump_test.h new file mode 100644 index 000000000000..ed47f01fa53c --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_test.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __COREDUMP_TEST_H +#define __COREDUMP_TEST_H + +#include <stdbool.h> +#include <sys/types.h> +#include <linux/coredump.h> + +#include "../kselftest_harness.h" +#include "../pidfd/pidfd.h" + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define NUM_THREAD_SPAWN 128 + +/* Coredump fixture */ +FIXTURE(coredump) +{ + char original_core_pattern[256]; + pid_t pid_coredump_server; + int fd_tmpfs_detached; +}; + +/* Shared helper function declarations */ +void *do_nothing(void *arg); +void crashing_child(void); +int create_detached_tmpfs(void); +int create_and_listen_unix_socket(const char *path); +bool set_core_pattern(const char *pattern); +int get_peer_pidfd(int fd); +bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info); + +/* Inline helper that uses harness types */ +static inline void wait_and_check_coredump_server(pid_t pid_coredump_server, + struct __test_metadata *const _metadata, + FIXTURE_DATA(coredump) *self) +{ + int status; + waitpid(pid_coredump_server, &status, 0); + self->pid_coredump_server = -ESRCH; + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +/* Protocol helper function declarations */ +ssize_t recv_marker(int fd); +bool read_marker(int fd, enum coredump_mark mark); +bool read_coredump_req(int fd, struct coredump_req *req); +bool send_coredump_ack(int fd, const struct coredump_req *req, + __u64 mask, size_t size_ack); +bool check_coredump_req(const struct coredump_req *req, size_t min_size, + __u64 required_mask); +int open_coredump_tmpfile(int fd_tmpfs_detached); +void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file); + +#endif /* __COREDUMP_TEST_H */ diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c new file mode 100644 index 000000000000..a6f6d5f2ae07 --- /dev/null +++ b/tools/testing/selftests/coredump/coredump_test_helpers.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <pthread.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/un.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "../filesystems/wrappers.h" +#include "../pidfd/pidfd.h" + +/* Forward declarations to avoid including harness header */ +struct __test_metadata; + +/* Match the fixture definition from coredump_test.h */ +struct _fixture_coredump_data { + char original_core_pattern[256]; + pid_t pid_coredump_server; + int fd_tmpfs_detached; +}; + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define NUM_THREAD_SPAWN 128 + +void *do_nothing(void *arg) +{ + (void)arg; + while (1) + pause(); + + return NULL; +} + +void crashing_child(void) +{ + pthread_t thread; + int i; + + for (i = 0; i < NUM_THREAD_SPAWN; ++i) + pthread_create(&thread, NULL, do_nothing, NULL); + + /* crash on purpose */ + i = *(int *)NULL; +} + +int create_detached_tmpfs(void) +{ + int fd_context, fd_tmpfs; + + fd_context = sys_fsopen("tmpfs", 0); + if (fd_context < 0) + return -1; + + if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) + return -1; + + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + close(fd_context); + return fd_tmpfs; +} + +int create_and_listen_unix_socket(const char *path) +{ + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + assert(strlen(path) < sizeof(addr.sun_path) - 1); + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + size_t addr_len = + offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1; + int fd, ret; + + fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (fd < 0) + goto out; + + ret = bind(fd, (const struct sockaddr *)&addr, addr_len); + if (ret < 0) + goto out; + + ret = listen(fd, 128); + if (ret < 0) + goto out; + + return fd; + +out: + if (fd >= 0) + close(fd); + return -1; +} + +bool set_core_pattern(const char *pattern) +{ + int fd; + ssize_t ret; + + fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC); + if (fd < 0) + return false; + + ret = write(fd, pattern, strlen(pattern)); + close(fd); + if (ret < 0) + return false; + + fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern)); + return ret == strlen(pattern); +} + +int get_peer_pidfd(int fd) +{ + int fd_peer_pidfd; + socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd); + int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, + &fd_peer_pidfd_len); + if (ret < 0) { + fprintf(stderr, "get_peer_pidfd: getsockopt(SO_PEERPIDFD) failed: %m\n"); + return -1; + } + fprintf(stderr, "get_peer_pidfd: successfully retrieved pidfd %d\n", fd_peer_pidfd); + return fd_peer_pidfd; +} + +bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info) +{ + int ret; + memset(info, 0, sizeof(*info)); + info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL; + ret = ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info); + if (ret < 0) { + fprintf(stderr, "get_pidfd_info: ioctl(PIDFD_GET_INFO) failed: %m\n"); + return false; + } + fprintf(stderr, "get_pidfd_info: mask=0x%llx, coredump_mask=0x%x, coredump_signal=%d\n", + (unsigned long long)info->mask, info->coredump_mask, info->coredump_signal); + return true; +} + +/* Protocol helper functions */ + +ssize_t recv_marker(int fd) +{ + enum coredump_mark mark = COREDUMP_MARK_REQACK; + ssize_t ret; + + ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL); + if (ret != sizeof(mark)) + return -1; + + switch (mark) { + case COREDUMP_MARK_REQACK: + fprintf(stderr, "Received marker: ReqAck\n"); + return COREDUMP_MARK_REQACK; + case COREDUMP_MARK_MINSIZE: + fprintf(stderr, "Received marker: MinSize\n"); + return COREDUMP_MARK_MINSIZE; + case COREDUMP_MARK_MAXSIZE: + fprintf(stderr, "Received marker: MaxSize\n"); + return COREDUMP_MARK_MAXSIZE; + case COREDUMP_MARK_UNSUPPORTED: + fprintf(stderr, "Received marker: Unsupported\n"); + return COREDUMP_MARK_UNSUPPORTED; + case COREDUMP_MARK_CONFLICTING: + fprintf(stderr, "Received marker: Conflicting\n"); + return COREDUMP_MARK_CONFLICTING; + default: + fprintf(stderr, "Received unknown marker: %u\n", mark); + break; + } + return -1; +} + +bool read_marker(int fd, enum coredump_mark mark) +{ + ssize_t ret; + + ret = recv_marker(fd); + if (ret < 0) + return false; + return ret == mark; +} + +bool read_coredump_req(int fd, struct coredump_req *req) +{ + ssize_t ret; + size_t field_size, user_size, ack_size, kernel_size, remaining_size; + + memset(req, 0, sizeof(*req)); + field_size = sizeof(req->size); + + /* Peek the size of the coredump request. */ + ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL); + if (ret != field_size) { + fprintf(stderr, "read_coredump_req: peek failed (got %zd, expected %zu): %m\n", + ret, field_size); + return false; + } + kernel_size = req->size; + + if (kernel_size < COREDUMP_ACK_SIZE_VER0) { + fprintf(stderr, "read_coredump_req: kernel_size %zu < min %d\n", + kernel_size, COREDUMP_ACK_SIZE_VER0); + return false; + } + if (kernel_size >= PAGE_SIZE) { + fprintf(stderr, "read_coredump_req: kernel_size %zu >= PAGE_SIZE %d\n", + kernel_size, PAGE_SIZE); + return false; + } + + /* Use the minimum of user and kernel size to read the full request. */ + user_size = sizeof(struct coredump_req); + ack_size = user_size < kernel_size ? user_size : kernel_size; + ret = recv(fd, req, ack_size, MSG_WAITALL); + if (ret != ack_size) + return false; + + fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n", + req->size, (unsigned long long)req->mask); + + if (user_size > kernel_size) + remaining_size = user_size - kernel_size; + else + remaining_size = kernel_size - user_size; + + if (PAGE_SIZE <= remaining_size) + return false; + + /* + * Discard any additional data if the kernel's request was larger than + * what we knew about or cared about. + */ + if (remaining_size) { + char buffer[PAGE_SIZE]; + + ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL); + if (ret != remaining_size) + return false; + fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size); + } + + return true; +} + +bool send_coredump_ack(int fd, const struct coredump_req *req, + __u64 mask, size_t size_ack) +{ + ssize_t ret; + /* + * Wrap struct coredump_ack in a larger struct so we can + * simulate sending to much data to the kernel. + */ + struct large_ack_for_size_testing { + struct coredump_ack ack; + char buffer[PAGE_SIZE]; + } large_ack = {}; + + if (!size_ack) + size_ack = sizeof(struct coredump_ack) < req->size_ack ? + sizeof(struct coredump_ack) : + req->size_ack; + large_ack.ack.mask = mask; + large_ack.ack.size = size_ack; + ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL); + if (ret != size_ack) + return false; + + fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n", + size_ack, (unsigned long long)mask); + return true; +} + +bool check_coredump_req(const struct coredump_req *req, size_t min_size, + __u64 required_mask) +{ + if (req->size < min_size) + return false; + if ((req->mask & required_mask) != required_mask) + return false; + if (req->mask & ~required_mask) + return false; + return true; +} + +int open_coredump_tmpfile(int fd_tmpfs_detached) +{ + return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); +} + +void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file) +{ + int epfd = -1; + int exit_code = EXIT_FAILURE; + struct epoll_event ev; + int flags; + + /* Set socket to non-blocking mode for edge-triggered epoll */ + flags = fcntl(fd_coredump, F_GETFL, 0); + if (flags < 0) { + fprintf(stderr, "Worker: fcntl(F_GETFL) failed: %m\n"); + goto out; + } + if (fcntl(fd_coredump, F_SETFL, flags | O_NONBLOCK) < 0) { + fprintf(stderr, "Worker: fcntl(F_SETFL, O_NONBLOCK) failed: %m\n"); + goto out; + } + + epfd = epoll_create1(0); + if (epfd < 0) { + fprintf(stderr, "Worker: epoll_create1() failed: %m\n"); + goto out; + } + + ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; + ev.data.fd = fd_coredump; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) { + fprintf(stderr, "Worker: epoll_ctl(EPOLL_CTL_ADD) failed: %m\n"); + goto out; + } + + for (;;) { + struct epoll_event events[1]; + int n = epoll_wait(epfd, events, 1, -1); + if (n < 0) { + fprintf(stderr, "Worker: epoll_wait() failed: %m\n"); + break; + } + + if (events[0].events & (EPOLLIN | EPOLLRDHUP)) { + for (;;) { + char buffer[4096]; + ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer)); + if (bytes_read < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; + fprintf(stderr, "Worker: read() failed: %m\n"); + goto out; + } + if (bytes_read == 0) + goto done; + ssize_t bytes_write = write(fd_core_file, buffer, bytes_read); + if (bytes_write != bytes_read) { + if (bytes_write < 0 && errno == ENOSPC) + continue; + fprintf(stderr, "Worker: write() failed (read=%zd, write=%zd): %m\n", + bytes_read, bytes_write); + goto out; + } + } + } + } + +done: + exit_code = EXIT_SUCCESS; + fprintf(stderr, "Worker: completed successfully\n"); +out: + if (epfd >= 0) + close(epfd); + if (fd_core_file >= 0) + close(fd_core_file); + if (fd_peer_pidfd >= 0) + close(fd_peer_pidfd); + if (fd_coredump >= 0) + close(fd_coredump); + _exit(exit_code); +} diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index a4ac80bb1003..c2e895bcc160 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -23,57 +23,15 @@ #include "../filesystems/wrappers.h" #include "../pidfd/pidfd.h" +#include "coredump_test.h" + #define STACKDUMP_FILE "stack_values" #define STACKDUMP_SCRIPT "stackdump" -#define NUM_THREAD_SPAWN 128 #ifndef PAGE_SIZE #define PAGE_SIZE 4096 #endif -static void *do_nothing(void *) -{ - while (1) - pause(); - - return NULL; -} - -static void crashing_child(void) -{ - pthread_t thread; - int i; - - for (i = 0; i < NUM_THREAD_SPAWN; ++i) - pthread_create(&thread, NULL, do_nothing, NULL); - - /* crash on purpose */ - i = *(int *)NULL; -} - -FIXTURE(coredump) -{ - char original_core_pattern[256]; - pid_t pid_coredump_server; - int fd_tmpfs_detached; -}; - -static int create_detached_tmpfs(void) -{ - int fd_context, fd_tmpfs; - - fd_context = sys_fsopen("tmpfs", 0); - if (fd_context < 0) - return -1; - - if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) - return -1; - - fd_tmpfs = sys_fsmount(fd_context, 0, 0); - close(fd_context); - return fd_tmpfs; -} - FIXTURE_SETUP(coredump) { FILE *file; @@ -208,1620 +166,4 @@ TEST_F_TIMEOUT(coredump, stackdump, 120) fclose(file); } -static int create_and_listen_unix_socket(const char *path) -{ - struct sockaddr_un addr = { - .sun_family = AF_UNIX, - }; - assert(strlen(path) < sizeof(addr.sun_path) - 1); - strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); - size_t addr_len = - offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1; - int fd, ret; - - fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (fd < 0) - goto out; - - ret = bind(fd, (const struct sockaddr *)&addr, addr_len); - if (ret < 0) - goto out; - - ret = listen(fd, 128); - if (ret < 0) - goto out; - - return fd; - -out: - if (fd >= 0) - close(fd); - return -1; -} - -static bool set_core_pattern(const char *pattern) -{ - int fd; - ssize_t ret; - - fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC); - if (fd < 0) - return false; - - ret = write(fd, pattern, strlen(pattern)); - close(fd); - if (ret < 0) - return false; - - fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern)); - return ret == strlen(pattern); -} - -static int get_peer_pidfd(int fd) -{ - int fd_peer_pidfd; - socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd); - int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd, - &fd_peer_pidfd_len); - if (ret < 0) { - fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n"); - return -1; - } - return fd_peer_pidfd; -} - -static bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info) -{ - memset(info, 0, sizeof(*info)); - info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - return ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info) == 0; -} - -static void -wait_and_check_coredump_server(pid_t pid_coredump_server, - struct __test_metadata *const _metadata, - FIXTURE_DATA(coredump)* self) -{ - int status; - waitpid(pid_coredump_server, &status, 0); - self->pid_coredump_server = -ESRCH; - ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); -} - -TEST_F(coredump, socket) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - fd_core_file = creat("/tmp/coredump.file", 0644); - if (fd_core_file < 0) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) - goto out; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); - ASSERT_GT(st.st_size, 0); - system("file /tmp/coredump.file"); -} - -TEST_F(coredump, socket_detect_userspace_client) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (info.coredump_mask & PIDFD_COREDUMPED) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) { - int fd_socket; - ssize_t ret; - const struct sockaddr_un coredump_sk = { - .sun_family = AF_UNIX, - .sun_path = "/tmp/coredump.socket", - }; - size_t coredump_sk_len = - offsetof(struct sockaddr_un, sun_path) + - sizeof("/tmp/coredump.socket"); - - fd_socket = socket(AF_UNIX, SOCK_STREAM, 0); - if (fd_socket < 0) - _exit(EXIT_FAILURE); - - ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) - _exit(EXIT_FAILURE); - - close(fd_socket); - _exit(EXIT_SUCCESS); - } - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFEXITED(status)); - ASSERT_EQ(WEXITSTATUS(status), 0); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_NE(stat("/tmp/coredump.file", &st), 0); - ASSERT_EQ(errno, ENOENT); -} - -TEST_F(coredump, socket_enoent) -{ - int pidfd, status; - pid_t pid; - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); -} - -TEST_F(coredump, socket_no_listener) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - int ipc_sockets[2]; - char c; - const struct sockaddr_un coredump_sk = { - .sun_family = AF_UNIX, - .sun_path = "/tmp/coredump.socket", - }; - size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) + - sizeof("/tmp/coredump.socket"); - - ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (fd_server < 0) - goto out; - - ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len); - if (ret < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_server >= 0) - close(fd_server); - close(ipc_sockets[1]); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -static ssize_t recv_marker(int fd) -{ - enum coredump_mark mark = COREDUMP_MARK_REQACK; - ssize_t ret; - - ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL); - if (ret != sizeof(mark)) - return -1; - - switch (mark) { - case COREDUMP_MARK_REQACK: - fprintf(stderr, "Received marker: ReqAck\n"); - return COREDUMP_MARK_REQACK; - case COREDUMP_MARK_MINSIZE: - fprintf(stderr, "Received marker: MinSize\n"); - return COREDUMP_MARK_MINSIZE; - case COREDUMP_MARK_MAXSIZE: - fprintf(stderr, "Received marker: MaxSize\n"); - return COREDUMP_MARK_MAXSIZE; - case COREDUMP_MARK_UNSUPPORTED: - fprintf(stderr, "Received marker: Unsupported\n"); - return COREDUMP_MARK_UNSUPPORTED; - case COREDUMP_MARK_CONFLICTING: - fprintf(stderr, "Received marker: Conflicting\n"); - return COREDUMP_MARK_CONFLICTING; - default: - fprintf(stderr, "Received unknown marker: %u\n", mark); - break; - } - return -1; -} - -static bool read_marker(int fd, enum coredump_mark mark) -{ - ssize_t ret; - - ret = recv_marker(fd); - if (ret < 0) - return false; - return ret == mark; -} - -static bool read_coredump_req(int fd, struct coredump_req *req) -{ - ssize_t ret; - size_t field_size, user_size, ack_size, kernel_size, remaining_size; - - memset(req, 0, sizeof(*req)); - field_size = sizeof(req->size); - - /* Peek the size of the coredump request. */ - ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL); - if (ret != field_size) - return false; - kernel_size = req->size; - - if (kernel_size < COREDUMP_ACK_SIZE_VER0) - return false; - if (kernel_size >= PAGE_SIZE) - return false; - - /* Use the minimum of user and kernel size to read the full request. */ - user_size = sizeof(struct coredump_req); - ack_size = user_size < kernel_size ? user_size : kernel_size; - ret = recv(fd, req, ack_size, MSG_WAITALL); - if (ret != ack_size) - return false; - - fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n", - req->size, (unsigned long long)req->mask); - - if (user_size > kernel_size) - remaining_size = user_size - kernel_size; - else - remaining_size = kernel_size - user_size; - - if (PAGE_SIZE <= remaining_size) - return false; - - /* - * Discard any additional data if the kernel's request was larger than - * what we knew about or cared about. - */ - if (remaining_size) { - char buffer[PAGE_SIZE]; - - ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL); - if (ret != remaining_size) - return false; - fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size); - } - - return true; -} - -static bool send_coredump_ack(int fd, const struct coredump_req *req, - __u64 mask, size_t size_ack) -{ - ssize_t ret; - /* - * Wrap struct coredump_ack in a larger struct so we can - * simulate sending to much data to the kernel. - */ - struct large_ack_for_size_testing { - struct coredump_ack ack; - char buffer[PAGE_SIZE]; - } large_ack = {}; - - if (!size_ack) - size_ack = sizeof(struct coredump_ack) < req->size_ack ? - sizeof(struct coredump_ack) : - req->size_ack; - large_ack.ack.mask = mask; - large_ack.ack.size = size_ack; - ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL); - if (ret != size_ack) - return false; - - fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n", - size_ack, (unsigned long long)mask); - return true; -} - -static bool check_coredump_req(const struct coredump_req *req, size_t min_size, - __u64 required_mask) -{ - if (req->size < min_size) - return false; - if ((req->mask & required_mask) != required_mask) - return false; - if (req->mask & ~required_mask) - return false; - return true; -} - -TEST_F(coredump, socket_request_kernel) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct stat st; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - fd_core_file = creat("/tmp/coredump.file", 0644); - if (fd_core_file < 0) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) - goto out; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); - - ASSERT_EQ(stat("/tmp/coredump.file", &st), 0); - ASSERT_GT(st.st_size, 0); - system("file /tmp/coredump.file"); -} - -TEST_F(coredump, socket_request_userspace) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read > 0) - goto out; - - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_TRUE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_reject) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - - for (;;) { - char buffer[4096]; - ssize_t bytes_read; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read > 0) - goto out; - - if (bytes_read < 0) - goto out; - - if (bytes_read == 0) - break; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_flag_combination) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_unknown_flag) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_size_small) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, - COREDUMP_ACK_SIZE_VER0 / 2)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_request_invalid_size_large) -{ - int pidfd, ret, status; - pid_t pid, pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); - ASSERT_EQ(ret, 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - struct coredump_req req = {}; - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1; - int exit_code = EXIT_FAILURE; - - close(ipc_sockets[0]); - - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - - close(ipc_sockets[1]); - - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) - goto out; - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - - if (!(info.mask & PIDFD_INFO_COREDUMP)) - goto out; - - if (!(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - - if (!read_coredump_req(fd_coredump, &req)) - goto out; - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_REJECT | COREDUMP_WAIT, - COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) - goto out; - - if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) - goto out; - - exit_code = EXIT_SUCCESS; -out: - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - pid = fork(); - ASSERT_GE(pid, 0); - if (pid == 0) - crashing_child(); - - pidfd = sys_pidfd_open(pid, 0); - ASSERT_GE(pidfd, 0); - - waitpid(pid, &status, 0); - ASSERT_TRUE(WIFSIGNALED(status)); - ASSERT_FALSE(WCOREDUMP(status)); - - ASSERT_TRUE(get_pidfd_info(pidfd, &info)); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -static int open_coredump_tmpfile(int fd_tmpfs_detached) -{ - return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); -} - -#define NUM_CRASHING_COREDUMPS 5 - -TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500) -{ - int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; - pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - - ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - int exit_code = EXIT_FAILURE; - struct coredump_req req = {}; - - close(ipc_sockets[0]); - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) { - fprintf(stderr, "Failed to create and listen on unix socket\n"); - goto out; - } - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) { - fprintf(stderr, "Failed to notify parent via ipc socket\n"); - goto out; - } - close(ipc_sockets[1]); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) { - fprintf(stderr, "accept4 failed: %m\n"); - goto out; - } - - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) { - fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump); - goto out; - } - - if (!get_pidfd_info(fd_peer_pidfd, &info)) { - fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd); - goto out; - } - - if (!(info.mask & PIDFD_INFO_COREDUMP)) { - fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd); - goto out; - } - if (!(info.coredump_mask & PIDFD_COREDUMPED)) { - fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd); - goto out; - } - - if (!read_coredump_req(fd_coredump, &req)) { - fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump); - goto out; - } - - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) { - fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump); - goto out; - } - - if (!send_coredump_ack(fd_coredump, &req, - COREDUMP_KERNEL | COREDUMP_WAIT, 0)) { - fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump); - goto out; - } - - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) { - fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump); - goto out; - } - - fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); - if (fd_core_file < 0) { - fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump); - goto out; - } - - for (;;) { - char buffer[4096]; - ssize_t bytes_read, bytes_write; - - bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) { - fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump); - goto out; - } - - if (bytes_read == 0) - break; - - bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_read != bytes_write) { - fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file); - goto out; - } - } - - close(fd_core_file); - close(fd_peer_pidfd); - close(fd_coredump); - fd_peer_pidfd = -1; - fd_coredump = -1; - } - - exit_code = EXIT_SUCCESS; -out: - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_server >= 0) - close(fd_server); - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - pid[i] = fork(); - ASSERT_GE(pid[i], 0); - if (pid[i] == 0) - crashing_child(); - pidfd[i] = sys_pidfd_open(pid[i], 0); - ASSERT_GE(pidfd[i], 0); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - waitpid(pid[i], &status[i], 0); - ASSERT_TRUE(WIFSIGNALED(status[i])); - ASSERT_TRUE(WCOREDUMP(status[i])); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - } - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -#define MAX_EVENTS 128 - -static void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file) -{ - int epfd = -1; - int exit_code = EXIT_FAILURE; - - epfd = epoll_create1(0); - if (epfd < 0) - goto out; - - struct epoll_event ev; - ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; - ev.data.fd = fd_coredump; - if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) - goto out; - - for (;;) { - struct epoll_event events[1]; - int n = epoll_wait(epfd, events, 1, -1); - if (n < 0) - break; - - if (events[0].events & (EPOLLIN | EPOLLRDHUP)) { - for (;;) { - char buffer[4096]; - ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer)); - if (bytes_read < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - break; - goto out; - } - if (bytes_read == 0) - goto done; - ssize_t bytes_write = write(fd_core_file, buffer, bytes_read); - if (bytes_write != bytes_read) - goto out; - } - } - } - -done: - exit_code = EXIT_SUCCESS; -out: - if (epfd >= 0) - close(epfd); - if (fd_core_file >= 0) - close(fd_core_file); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_coredump >= 0) - close(fd_coredump); - _exit(exit_code); -} - -TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500) -{ - int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS]; - pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS]; - struct pidfd_info info = {}; - int ipc_sockets[2]; - char c; - - ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket")); - ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); - - pid_coredump_server = fork(); - ASSERT_GE(pid_coredump_server, 0); - if (pid_coredump_server == 0) { - int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0; - fd_server = -1; - exit_code = EXIT_FAILURE; - n_conns = 0; - close(ipc_sockets[0]); - fd_server = create_and_listen_unix_socket("/tmp/coredump.socket"); - if (fd_server < 0) - goto out; - - if (write_nointr(ipc_sockets[1], "1", 1) < 0) - goto out; - close(ipc_sockets[1]); - - while (n_conns < NUM_CRASHING_COREDUMPS) { - int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1; - struct coredump_req req = {}; - fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC); - if (fd_coredump < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - continue; - goto out; - } - fd_peer_pidfd = get_peer_pidfd(fd_coredump); - if (fd_peer_pidfd < 0) - goto out; - if (!get_pidfd_info(fd_peer_pidfd, &info)) - goto out; - if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) - goto out; - if (!read_coredump_req(fd_coredump, &req)) - goto out; - if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0, - COREDUMP_KERNEL | COREDUMP_USERSPACE | - COREDUMP_REJECT | COREDUMP_WAIT)) - goto out; - if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) - goto out; - if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) - goto out; - fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached); - if (fd_core_file < 0) - goto out; - pid_t worker = fork(); - if (worker == 0) { - close(fd_server); - process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file); - } - worker_pids[n_conns] = worker; - if (fd_coredump >= 0) - close(fd_coredump); - if (fd_peer_pidfd >= 0) - close(fd_peer_pidfd); - if (fd_core_file >= 0) - close(fd_core_file); - n_conns++; - } - exit_code = EXIT_SUCCESS; -out: - if (fd_server >= 0) - close(fd_server); - - // Reap all worker processes - for (int i = 0; i < n_conns; i++) { - int wstatus; - if (waitpid(worker_pids[i], &wstatus, 0) < 0) { - fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]); - } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) { - fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus)); - exit_code = EXIT_FAILURE; - } - } - - _exit(exit_code); - } - self->pid_coredump_server = pid_coredump_server; - - EXPECT_EQ(close(ipc_sockets[1]), 0); - ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); - EXPECT_EQ(close(ipc_sockets[0]), 0); - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - pid[i] = fork(); - ASSERT_GE(pid[i], 0); - if (pid[i] == 0) - crashing_child(); - pidfd[i] = sys_pidfd_open(pid[i], 0); - ASSERT_GE(pidfd[i], 0); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - ASSERT_GE(waitpid(pid[i], &status[i], 0), 0); - ASSERT_TRUE(WIFSIGNALED(status[i])); - ASSERT_TRUE(WCOREDUMP(status[i])); - } - - for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) { - info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP; - ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0); - ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0); - ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0); - } - - wait_and_check_coredump_server(pid_coredump_server, _metadata, self); -} - -TEST_F(coredump, socket_invalid_paths) -{ - ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket")); - ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/..")); - ASSERT_FALSE(set_core_pattern("@..")); - - ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@../coredump.socket")); - ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/..")); - ASSERT_FALSE(set_core_pattern("@@..")); - - ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket")); -} - TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 6e41635bd55a..71ee69e524d7 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -18,6 +18,7 @@ TEST_PROGS := \ netcons_fragmented_msg.sh \ netcons_overflow.sh \ netcons_sysdata.sh \ + netcons_torture.sh \ netpoll_basic.py \ ping.py \ psp.py \ diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 402d4ee84f2e..6c5c60adb5e8 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -14,6 +14,7 @@ TEST_PROGS := \ dev_addr_lists.sh \ mode-1-recovery-updelay.sh \ mode-2-recovery-updelay.sh \ + netcons_over_bonding.sh \ # end of TEST_PROGS TEST_FILES := \ @@ -24,6 +25,7 @@ TEST_FILES := \ TEST_INCLUDES := \ ../../../net/lib.sh \ + ../lib/sh/lib_netcons.sh \ ../../../net/forwarding/lib.sh \ # end of TEST_INCLUDES diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 6bb290abd48b..991494376223 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -1,5 +1,6 @@ CONFIG_BONDING=y CONFIG_BRIDGE=y +CONFIG_CONFIGFS_FS=y CONFIG_DUMMY=y CONFIG_INET_ESP=y CONFIG_INET_ESP_OFFLOAD=y @@ -9,6 +10,9 @@ CONFIG_MACVLAN=y CONFIG_NET_ACT_GACT=y CONFIG_NET_CLS_FLOWER=y CONFIG_NET_CLS_MATCHALL=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETCONSOLE_EXTENDED_LOG=y CONFIG_NETDEVSIM=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y diff --git a/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh new file mode 100755 index 000000000000..477cc9379500 --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh @@ -0,0 +1,361 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 +# +# This selftest exercises trying to have multiple netpoll users at the same +# time. +# +# This selftest has multiple smalls test inside, and the goal is to +# get interfaces with bonding and netconsole in different orders in order +# to catch any possible issue. +# +# The main test composes of four interfaces being created using netdevsim; two +# of them are bonded to serve as the netconsole's transmit interface. The +# remaining two interfaces are similarly bonded and assigned to a separate +# network namespace, which acts as the receive interface, where socat monitors +# for incoming messages. +# +# A netconsole message is then sent to ensure it is properly received across +# this configuration. +# +# Later, run a few other tests, to make sure that bonding and netconsole +# cannot coexist. +# +# The test's objective is to exercise netpoll usage when managed simultaneously +# by multiple subsystems (netconsole and bonding). +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true +modprobe bonding 2> /dev/null || true +modprobe veth 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup_bond EXIT + +FORMAT="extended" +IP_VERSION="ipv4" +VETH0="veth"$(( RANDOM % 256)) +VETH1="veth"$((256 + RANDOM % 256)) +TXNS="" +RXNS="" + +# Create "bond_tx_XX" and "bond_rx_XX" interfaces, and set DSTIF and SRCIF with +# the bonding interfaces +function setup_bonding_ifaces() { + local RAND=$(( RANDOM % 100 )) + BOND_TX_MAIN_IF="bond_tx_$RAND" + BOND_RX_MAIN_IF="bond_rx_$RAND" + + # Setup TX + if ! ip -n "${TXNS}" link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr + then + echo "Failed to create bond TX interface. Is CONFIG_BONDING set?" >&2 + # only clean nsim ifaces and namespace. Nothing else has been + # initialized + cleanup_bond_nsim + trap - EXIT + exit "${ksft_skip}" + fi + + # create_netdevsim() got the interface up, but it needs to be down + # before being enslaved. + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX_MAIN_IF}" up + + # Setup RX + ip -n "${RXNS}" \ + link add "${BOND_RX_MAIN_IF}" type bond mode balance-rr + ip -n "${RXNS}" \ + link set "${BOND_RX1_SLAVE_IF}" down + ip -n "${RXNS}" \ + link set "${BOND_RX2_SLAVE_IF}" down + ip -n "${RXNS}" \ + link set "${BOND_RX1_SLAVE_IF}" master "${BOND_RX_MAIN_IF}" + ip -n "${RXNS}" \ + link set "${BOND_RX2_SLAVE_IF}" master "${BOND_RX_MAIN_IF}" + ip -n "${RXNS}" \ + link set "${BOND_RX_MAIN_IF}" up + + export DSTIF="${BOND_RX_MAIN_IF}" + export SRCIF="${BOND_TX_MAIN_IF}" +} + +# Create 4 netdevsim interfaces. Two of them will be bound to TX bonding iface +# and the other two will be bond to the RX interface (on the other namespace) +function create_ifaces_bond() { + BOND_TX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_1}" "${TXNS}") + BOND_TX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_2}" "${TXNS}") + BOND_RX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_1}" "${RXNS}") + BOND_RX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_2}" "${RXNS}") +} + +# netdevsim link BOND_TX to BOND_RX interfaces +function link_ifaces_bond() { + local BOND_TX1_SLAVE_IFIDX + local BOND_TX2_SLAVE_IFIDX + local BOND_RX1_SLAVE_IFIDX + local BOND_RX2_SLAVE_IFIDX + local TXNS_FD + local RXNS_FD + + BOND_TX1_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \ + cat /sys/class/net/"$BOND_TX1_SLAVE_IF"/ifindex) + BOND_TX2_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \ + cat /sys/class/net/"$BOND_TX2_SLAVE_IF"/ifindex) + BOND_RX1_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \ + cat /sys/class/net/"$BOND_RX1_SLAVE_IF"/ifindex) + BOND_RX2_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \ + cat /sys/class/net/"$BOND_RX2_SLAVE_IF"/ifindex) + + exec {TXNS_FD}</var/run/netns/"${TXNS}" + exec {RXNS_FD}</var/run/netns/"${RXNS}" + + # Linking TX ifaces to the RX ones (on the other namespace) + echo "${TXNS_FD}:$BOND_TX1_SLAVE_IFIDX $RXNS_FD:$BOND_RX1_SLAVE_IFIDX" \ + > "$NSIM_DEV_SYS_LINK" + echo "${TXNS_FD}:$BOND_TX2_SLAVE_IFIDX $RXNS_FD:$BOND_RX2_SLAVE_IFIDX" \ + > "$NSIM_DEV_SYS_LINK" + + exec {TXNS_FD}<&- + exec {RXNS_FD}<&- +} + +function create_all_ifaces() { + # setup_ns function is coming from lib.sh + setup_ns TXNS RXNS + export NAMESPACE="${RXNS}" + + # Create two interfaces for RX and two for TX + create_ifaces_bond + # Link netlink ifaces + link_ifaces_bond +} + +# configure DSTIF and SRCIF IPs +function configure_ifaces_ips() { + local IP_VERSION=${1:-"ipv4"} + select_ipv4_or_ipv6 "${IP_VERSION}" + + ip -n "${RXNS}" addr add "${DSTIP}"/24 dev "${DSTIF}" + ip -n "${RXNS}" link set "${DSTIF}" up + + ip -n "${TXNS}" addr add "${SRCIP}"/24 dev "${SRCIF}" + ip -n "${TXNS}" link set "${SRCIF}" up +} + +function test_enable_netpoll_on_enslaved_iface() { + echo 0 > "${NETCONS_PATH}"/enabled + + # At this stage, BOND_TX1_SLAVE_IF is enslaved to BOND_TX_MAIN_IF, and + # linked to BOND_RX1_SLAVE_IF inside the namespace. + echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name + + # This should fail with the following message in dmesg: + # netpoll: netconsole: ethX is a slave device, aborting + set +e + enable_netcons_ns 2> /dev/null + set -e + + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]] + then + echo "test failed: Bonding and netpoll cannot co-exists." >&2 + exit "${ksft_fail}" + fi +} + +function test_delete_bond_and_reenable_target() { + ip -n "${TXNS}" \ + link delete "${BOND_TX_MAIN_IF}" type bond + + # BOND_TX1_SLAVE_IF is not attached to a bond interface anymore + # netpoll can be plugged in there + echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name + + # this should work, since the interface is not enslaved + enable_netcons_ns + + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: Unable to start netpoll on an unbond iface." >&2 + exit "${ksft_fail}" + fi +} + +# Send a netconsole message to the netconsole target +function test_send_netcons_msg_through_bond_iface() { + # Listen for netconsole port inside the namespace and + # destination interface + listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" & + # Wait for socat to start and listen to the port. + wait_for_port "${RXNS}" "${PORT}" "${IP_VERSION}" + # Send the message + echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" + # Make sure the message was received in the dst part + # and exit + validate_result "${OUTPUT_FILE}" "${FORMAT}" + # kill socat in case it is still running + pkill_socat +} + +# BOND_TX1_SLAVE_IF has netconsole enabled on it, bind it to BOND_TX_MAIN_IF. +# Given BOND_TX_MAIN_IF was deleted, recreate it first +function test_enslave_netcons_enabled_iface { + # netconsole got disabled while the interface was down + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: netconsole expected to be enabled against BOND_TX1_SLAVE_IF" >&2 + exit "${ksft_fail}" + fi + + # recreate the bonding iface. it got deleted by previous + # test (test_delete_bond_and_reenable_target) + ip -n "${TXNS}" \ + link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr + + # sub-interface need to be down before attaching to bonding + # This will also disable netconsole. + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" down + ip -n "${TXNS}" \ + link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + ip -n "${TXNS}" \ + link set "${BOND_TX_MAIN_IF}" up + + # netconsole got disabled while the interface was down + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]] + then + echo "test failed: Device is part of a bond iface, cannot have netcons enabled" >&2 + exit "${ksft_fail}" + fi +} + +# Get netconsole enabled on a bonding interface and attach a second +# sub-interface. +function test_enslave_iface_to_bond { + # BOND_TX_MAIN_IF has only BOND_TX1_SLAVE_IF right now + echo "${BOND_TX_MAIN_IF}" > "${NETCONS_PATH}"/dev_name + enable_netcons_ns + + # netcons is attached to bond0 and BOND_TX1_SLAVE_IF is + # part of BOND_TX_MAIN_IF. Attach BOND_TX2_SLAVE_IF to BOND_TX_MAIN_IF. + ip -n "${TXNS}" \ + link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}" + if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]] + then + echo "test failed: Netconsole should be enabled on bonding interface. Failed" >&2 + exit "${ksft_fail}" + fi +} + +function test_enslave_iff_disabled_netpoll_iface { + local ret + + # Create two interfaces. veth interfaces it known to have + # IFF_DISABLE_NETPOLL set + if ! ip link add "${VETH0}" type veth peer name "${VETH1}" + then + echo "Failed to create veth TX interface. Is CONFIG_VETH set?" >&2 + exit "${ksft_skip}" + fi + set +e + # This will print RTNETLINK answers: Device or resource busy + ip link set "${VETH0}" master "${BOND_TX_MAIN_IF}" 2> /dev/null + ret=$? + set -e + if [[ $ret -eq 0 ]] + then + echo "test failed: veth interface could not be enslaved" + exit "${ksft_fail}" + fi +} + +# Given that netconsole picks the current net namespace, we need to enable it +# from inside the TXNS namespace +function enable_netcons_ns() { + ip netns exec "${TXNS}" sh -c \ + "mount -t configfs configfs /sys/kernel/config && echo 1 > $NETCONS_PATH/enabled" +} + +#################### +# Tests start here # +#################### + +# Create regular interfaces using netdevsim and link them +create_all_ifaces + +# Setup the bonding interfaces +# BOND_RX_MAIN_IF has BOND_RX{1,2}_SLAVE_IF +# BOND_TX_MAIN_IF has BOND_TX{1,2}_SLAVE_IF +setup_bonding_ifaces + +# Configure the ips as BOND_RX1_SLAVE_IF and BOND_TX1_SLAVE_IF +configure_ifaces_ips "${IP_VERSION}" + +_create_dynamic_target "${FORMAT}" "${NETCONS_PATH}" +enable_netcons_ns +set_user_data + +# Test #1 : Create an bonding interface and attach netpoll into +# the bonding interface. Netconsole/netpoll should work on +# the bonding interface. +test_send_netcons_msg_through_bond_iface +echo "test #1: netpoll on bonding interface worked. Test passed" >&2 + +# Test #2: Attach netpoll to an enslaved interface +# Try to attach netpoll to an enslaved sub-interface (while still being part of +# a bonding interface), which shouldn't be allowed +test_enable_netpoll_on_enslaved_iface +echo "test #2: netpoll correctly rejected enslaved interface (expected behavior). Test passed." >&2 + +# Test #3: Unplug the sub-interface from bond and enable netconsole +# Detach the interface from a bonding interface and attach netpoll again +test_delete_bond_and_reenable_target +echo "test #3: Able to attach to an unbound interface. Test passed." >&2 + +# Test #4: Enslave a sub-interface that had netconsole enabled +# Try to enslave an interface that has netconsole/netpoll enabled. +# Previous test has netconsole enabled in BOND_TX1_SLAVE_IF, try to enslave it +test_enslave_netcons_enabled_iface +echo "test #4: Enslaving an interface with netpoll attached. Test passed." >&2 + +# Test #5: Enslave a sub-interface to a bonding interface +# Enslave an interface to a bond interface that has netpoll attached +# At this stage, BOND_TX_MAIN_IF is created and BOND_TX1_SLAVE_IF is part of +# it. Netconsole is currently disabled +test_enslave_iface_to_bond +echo "test #5: Enslaving an interface to bond+netpoll. Test passed." >&2 + +# Test #6: Enslave a IFF_DISABLE_NETPOLL sub-interface to a bonding interface +# At this stage, BOND_TX_MAIN_IF has both sub interface and netconsole is +# enabled. This test will try to enslave an a veth (IFF_DISABLE_NETPOLL) interface +# and it should fail, with netpoll: veth0 doesn't support polling +test_enslave_iff_disabled_netpoll_iface +echo "test #6: Enslaving IFF_DISABLE_NETPOLL ifaces to bond iface is not supported. Test passed." >&2 + +cleanup_bond +trap - EXIT +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index 8e1085e89647..87f89fd92f8c 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -11,9 +11,11 @@ set -euo pipefail LIBDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") SRCIF="" # to be populated later +SRCIP="" # to be populated later SRCIP4="192.0.2.1" SRCIP6="fc00::1" DSTIF="" # to be populated later +DSTIP="" # to be populated later DSTIP4="192.0.2.2" DSTIP6="fc00::2" @@ -28,17 +30,23 @@ NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}" # NAMESPACE will be populated by setup_ns with a random value NAMESPACE="" -# IDs for netdevsim +# IDs for netdevsim. We either use NSIM_DEV_{1,2}_ID for standard test +# or NSIM_BOND_{T,R}X_{1,2} for the bonding tests. Not both at the +# same time. NSIM_DEV_1_ID=$((256 + RANDOM % 256)) NSIM_DEV_2_ID=$((512 + RANDOM % 256)) +NSIM_BOND_TX_1=$((768 + RANDOM % 256)) +NSIM_BOND_TX_2=$((1024 + RANDOM % 256)) +NSIM_BOND_RX_1=$((1280 + RANDOM % 256)) +NSIM_BOND_RX_2=$((1536 + RANDOM % 256)) NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device" +NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device" # Used to create and delete namespaces source "${LIBDIR}"/../../../../net/lib.sh # Create netdevsim interfaces create_ifaces() { - echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW" echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW" udevadm settle 2> /dev/null || true @@ -113,31 +121,38 @@ function set_network() { configure_ip } -function create_dynamic_target() { - local FORMAT=${1:-"extended"} +function _create_dynamic_target() { + local FORMAT="${1:?FORMAT parameter required}" + local NCPATH="${2:?NCPATH parameter required}" DSTMAC=$(ip netns exec "${NAMESPACE}" \ ip link show "${DSTIF}" | awk '/ether/ {print $2}') # Create a dynamic target - mkdir "${NETCONS_PATH}" + mkdir "${NCPATH}" - echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip - echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip - echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac - echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name + echo "${DSTIP}" > "${NCPATH}"/remote_ip + echo "${SRCIP}" > "${NCPATH}"/local_ip + echo "${DSTMAC}" > "${NCPATH}"/remote_mac + echo "${SRCIF}" > "${NCPATH}"/dev_name if [ "${FORMAT}" == "basic" ] then # Basic target does not support release - echo 0 > "${NETCONS_PATH}"/release - echo 0 > "${NETCONS_PATH}"/extended + echo 0 > "${NCPATH}"/release + echo 0 > "${NCPATH}"/extended elif [ "${FORMAT}" == "extended" ] then - echo 1 > "${NETCONS_PATH}"/extended + echo 1 > "${NCPATH}"/extended fi +} - echo 1 > "${NETCONS_PATH}"/enabled +function create_dynamic_target() { + local FORMAT=${1:-"extended"} + local NCPATH=${2:-"$NETCONS_PATH"} + _create_dynamic_target "${FORMAT}" "${NCPATH}" + + echo 1 > "${NCPATH}"/enabled # This will make sure that the kernel was able to # load the netconsole driver configuration. The console message @@ -185,14 +200,26 @@ function do_cleanup() { echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk } -function cleanup() { +function cleanup_netcons() { # delete netconsole dynamic reconfiguration - echo 0 > "${NETCONS_PATH}"/enabled + # do not fail if the target is already disabled + if [[ ! -d "${NETCONS_PATH}" ]] + then + # in some cases this is called before netcons path is created + return + fi + if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]] + then + echo 0 > "${NETCONS_PATH}"/enabled || true + fi # Remove all the keys that got created during the selftest find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete # Remove the configfs entry rmdir "${NETCONS_PATH}" +} +function cleanup() { + cleanup_netcons do_cleanup } @@ -369,3 +396,24 @@ function wait_for_port() { # more frequently on IPv6 sleep 1 } + +# Clean up netdevsim ifaces created for bonding test +function cleanup_bond_nsim() { + ip -n "${TXNS}" \ + link delete "${BOND_TX_MAIN_IF}" type bond || true + ip -n "${RXNS}" \ + link delete "${BOND_RX_MAIN_IF}" type bond || true + + cleanup_netdevsim "$NSIM_BOND_TX_1" + cleanup_netdevsim "$NSIM_BOND_TX_2" + cleanup_netdevsim "$NSIM_BOND_RX_1" + cleanup_netdevsim "$NSIM_BOND_RX_2" +} + +# cleanup tests that use bonding interfaces +function cleanup_bond() { + cleanup_netcons + cleanup_bond_nsim + cleanup_all_ns + ip link delete "${VETH0}" || true +} diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netcons_torture.sh new file mode 100755 index 000000000000..2ce9ee3719d1 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_torture.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Repeatedly send kernel messages, toggles netconsole targets on and off, +# creates and deletes targets in parallel, and toggles the source interface to +# simulate stress conditions. +# +# This test aims to verify the robustness of netconsole under dynamic +# configurations and concurrent operations. +# +# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make +# sure no issues is reported. +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +# Number of times the main loop run +ITERATIONS=${1:-150} + +# Only test extended format +FORMAT="extended" +# And ipv6 only +IP_VERSION="ipv6" + +# Create, enable and delete some targets. +create_and_delete_random_target() { + COUNT=2 + RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_) + + if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \ + [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}0" ]; then + echo "Function didn't finish yet, skipping it." >&2 + return + fi + + # enable COUNT targets + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + + # Basic population so the target can come up + _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}" + done + + echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg + # disable them all + for i in $(seq ${COUNT}) + do + RND_TARGET="${RND_PREFIX}"${i} + RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}" + if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]] + then + echo 0 > "${RND_TARGET_PATH}"/enabled + fi + rmdir "${RND_TARGET_PATH}" + done +} + +# Disable and enable the target mid-air, while messages +# are being transmitted. +toggle_netcons_target() { + for i in $(seq 2) + do + if [ ! -d "${NETCONS_PATH}" ] + then + break + fi + echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + # Try to enable a bit harder, given it might fail to enable + # Write to `enabled` might fail depending on the lock, which is + # highly contentious here + for _ in $(seq 5) + do + echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true + done + done +} + +toggle_iface(){ + ip link set "${SRCIF}" down + ip link set "${SRCIF}" up +} + +# Start here + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network "${IP_VERSION}" +# Create a dynamic target for netconsole +create_dynamic_target "${FORMAT}" + +for i in $(seq "$ITERATIONS") +do + for _ in $(seq 10) + do + echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg + done + wait + + if (( i % 30 == 0 )); then + toggle_netcons_target & + fi + + if (( i % 50 == 0 )); then + # create some targets, enable them, send msg and disable + # all in a parallel thread + create_and_delete_random_target & + fi + + if (( i % 70 == 0 )); then + toggle_iface & + fi +done +wait + +exit "${EXIT_STATUS}" diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile index daf51113c827..df10c7243511 100644 --- a/tools/testing/selftests/drivers/net/netdevsim/Makefile +++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile @@ -20,4 +20,8 @@ TEST_PROGS := \ udp_tunnel_nic.sh \ # end of TEST_PROGS +TEST_FILES := \ + ethtool-common.sh +# end of TEST_FILES + include ../../../lib.mk diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c index c43a69dffd83..a0c64f415a7f 100644 --- a/tools/testing/selftests/filesystems/utils.c +++ b/tools/testing/selftests/filesystems/utils.c @@ -487,7 +487,7 @@ int setup_userns(void) uid_t uid = getuid(); gid_t gid = getgid(); - ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER); if (ret) { ksft_exit_fail_msg("unsharing mountns and userns: %s\n", strerror(errno)); diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc index c62165fabd0c..cfa16aa1f39a 100644 --- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc +++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc @@ -20,6 +20,10 @@ sample_events() { echo 0 > tracing_on echo 0 > events/enable +# Clear functions caused by page cache; run sample_events twice +sample_events +sample_events + echo "Get the most frequently calling function" echo > trace sample_events diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 3eebf5e3b974..bb4d33dde3c8 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -2638,6 +2638,8 @@ TEST_F(vfio_compat_mock_domain, map) ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size); + /* Unmap of empty is success */ + ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); /* UNMAP_FLAG_ALL requires 0 iova/size */ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 772ca1db6e59..9f472c20c190 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -1044,8 +1044,8 @@ static int _test_cmd_trigger_vevents(int fd, __u32 dev_id, __u32 nvevents) }; while (nvevents--) { - if (!ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT), - &trigger_vevent_cmd)) + if (ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT), + &trigger_vevent_cmd)) return -1; } return 0; diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c index c9b84eeaab6b..0a3a94c4cca1 100644 --- a/tools/testing/selftests/kvm/arm64/get-reg-list.c +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -63,11 +63,13 @@ static struct feature_id_reg feat_id_regs[] = { REG_FEAT(HDFGWTR2_EL2, ID_AA64MMFR0_EL1, FGT, FGT2), REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP), REG_FEAT(SCTLR2_EL1, ID_AA64MMFR3_EL1, SCTLRX, IMP), + REG_FEAT(SCTLR2_EL2, ID_AA64MMFR3_EL1, SCTLRX, IMP), REG_FEAT(VDISR_EL2, ID_AA64PFR0_EL1, RAS, IMP), REG_FEAT(VSESR_EL2, ID_AA64PFR0_EL1, RAS, IMP), REG_FEAT(VNCR_EL2, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY), REG_FEAT(CNTHV_CTL_EL2, ID_AA64MMFR1_EL1, VH, IMP), REG_FEAT(CNTHV_CVAL_EL2,ID_AA64MMFR1_EL1, VH, IMP), + REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP), }; bool filter_reg(__u64 reg) @@ -718,6 +720,7 @@ static __u64 el2_regs[] = { SYS_REG(VMPIDR_EL2), SYS_REG(SCTLR_EL2), SYS_REG(ACTLR_EL2), + SYS_REG(SCTLR2_EL2), SYS_REG(HCR_EL2), SYS_REG(MDCR_EL2), SYS_REG(CPTR_EL2), diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c index 09f270545646..0e2f8ed90f30 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -15,6 +15,8 @@ #include "gic_v3.h" #include "processor.h" +#define GITS_COLLECTION_TARGET_SHIFT 16 + static u64 its_read_u64(unsigned long offset) { return readq_relaxed(GITS_BASE_GVA + offset); @@ -163,6 +165,11 @@ static void its_encode_collection(struct its_cmd_block *cmd, u16 col) its_mask_encode(&cmd->raw_cmd[2], col, 15, 0); } +static u64 procnum_to_rdbase(u32 vcpu_id) +{ + return vcpu_id << GITS_COLLECTION_TARGET_SHIFT; +} + #define GITS_CMDQ_POLL_ITERATIONS 0 static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd) @@ -217,7 +224,7 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val its_encode_cmd(&cmd, GITS_CMD_MAPC); its_encode_collection(&cmd, collection_id); - its_encode_target(&cmd, vcpu_id); + its_encode_target(&cmd, procnum_to_rdbase(vcpu_id)); its_encode_valid(&cmd, valid); its_send_cmd(cmdq_base, &cmd); diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 9e3be2ee7f1b..f917b4c4c943 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -1758,10 +1758,15 @@ int main(int argc, char *argv[]) uffd_test_ops = mem_type->mem_ops; uffd_test_case_ops = test->test_case_ops; - if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) + if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) { gopts.page_size = default_huge_page_size(); - else + if (gopts.page_size == 0) { + uffd_test_skip("huge page size is 0, feature missing?"); + continue; + } + } else { gopts.page_size = psize(); + } /* Ensure we have at least 2 pages */ gopts.nr_pages = MAX(UFFD_TEST_MEM_SIZE, gopts.page_size * 2) @@ -1776,12 +1781,6 @@ int main(int argc, char *argv[]) continue; uffd_test_start("%s on %s", test->name, mem_type->name); - if ((mem_type->mem_flag == MEM_HUGETLB || - mem_type->mem_flag == MEM_HUGETLB_PRIVATE) && - (default_huge_page_size() == 0)) { - uffd_test_skip("huge page size is 0, feature missing?"); - continue; - } if (!uffd_feature_supported(test)) { uffd_test_skip("feature missing"); continue; diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index ccfb40837a73..0989e80da457 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -1,3 +1,12 @@ nsid_test file_handle_test init_ino_test +ns_active_ref_test +listns_test +listns_permissions_test +listns_efault_test +siocgskns_test +cred_change_test +stress_test +listns_pagination_bug +regression_pidfd_setns_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index 5fe4b3dc07d3..fbb821652c17 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -1,7 +1,29 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +LDLIBS += -lcap -TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test +TEST_GEN_PROGS := nsid_test \ + file_handle_test \ + init_ino_test \ + ns_active_ref_test \ + listns_test \ + listns_permissions_test \ + listns_efault_test \ + siocgskns_test \ + cred_change_test \ + stress_test \ + listns_pagination_bug \ + regression_pidfd_setns_test include ../lib.mk +$(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c +$(OUTPUT)/listns_test: ../filesystems/utils.c +$(OUTPUT)/listns_permissions_test: ../filesystems/utils.c +$(OUTPUT)/listns_efault_test: ../filesystems/utils.c +$(OUTPUT)/siocgskns_test: ../filesystems/utils.c +$(OUTPUT)/cred_change_test: ../filesystems/utils.c +$(OUTPUT)/stress_test: ../filesystems/utils.c +$(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c +$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c + diff --git a/tools/testing/selftests/namespaces/cred_change_test.c b/tools/testing/selftests/namespaces/cred_change_test.c new file mode 100644 index 000000000000..7b4f5ad3f725 --- /dev/null +++ b/tools/testing/selftests/namespaces/cred_change_test.c @@ -0,0 +1,814 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/capability.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test credential changes and their impact on namespace active references. + */ + +/* + * Test setuid() in a user namespace properly swaps active references. + * Create a user namespace with multiple UIDs mapped, then setuid() between them. + * Verify that the user namespace remains active throughout. + */ +TEST(setuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setuid_count; + + close(pipefd[0]); + + /* Create new user namespace with multiple UIDs mapped (0-9) */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send namespace ID to parent */ + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* + * Perform multiple setuid() calls. + * Each setuid() triggers commit_creds() which should properly + * swap active references via switch_cred_namespaces(). + */ + for (setuid_count = 0; setuid_count < 50; setuid_count++) { + uid_t target_uid = (setuid_count % 10); + if (setuid(target_uid) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + TH_LOG("Child user namespace ID: %llu", (unsigned long long)userns_id); + + /* Verify namespace is active while child is running */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + ASSERT_TRUE(found); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive after child exits */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setuid() correctly preserved active references (no leak)"); +} + +/* + * Test setgid() in a user namespace properly handles active references. + */ +TEST(setgid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setgid_count; + + close(pipefd[0]); + + /* Create new user namespace with multiple GIDs mapped */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setgid() calls */ + for (setgid_count = 0; setgid_count < 50; setgid_count++) { + gid_t target_gid = (setgid_count % 10); + if (setgid(target_gid) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setgid() correctly preserved active references (no leak)"); +} + +/* + * Test setresuid() which changes real, effective, and saved UIDs. + * This should properly swap active references via commit_creds(). + */ +TEST(setresuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int setres_count; + + close(pipefd[0]); + + /* Create new user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setresuid() calls */ + for (setres_count = 0; setres_count < 30; setres_count++) { + uid_t uid1 = (setres_count % 5); + uid_t uid2 = ((setres_count + 1) % 5); + uid_t uid3 = ((setres_count + 2) % 5); + + if (setresuid(uid1, uid2, uid3) < 0) { + if (errno != EPERM) { + close(pipefd[1]); + exit(1); + } + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setresuid() correctly preserved active references (no leak)"); +} + +/* + * Test credential changes across multiple user namespaces. + * Create nested user namespaces and verify active reference tracking. + */ +TEST(cred_change_nested_userns) +{ + pid_t pid; + int status; + __u64 parent_userns_id, child_userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found_parent = false, found_child = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 parent_id, child_id; + uid_t orig_uid = getuid(); + + close(pipefd[0]); + + /* Create first user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 1); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get first namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create nested user namespace */ + userns_fd = get_userns_fd(0, 0, 1); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get nested namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send both IDs to parent */ + write(pipefd[1], &parent_id, sizeof(parent_id)); + write(pipefd[1], &child_id, sizeof(child_id)); + + /* Perform some credential changes in nested namespace */ + setuid(0); + setgid(0); + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + /* Read both namespace IDs */ + if (read(pipefd[0], &parent_userns_id, sizeof(parent_userns_id)) != sizeof(parent_userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get parent namespace ID"); + } + + if (read(pipefd[0], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get child namespace ID"); + } + close(pipefd[0]); + + TH_LOG("Parent userns: %llu, Child userns: %llu", + (unsigned long long)parent_userns_id, + (unsigned long long)child_userns_id); + + /* Verify both namespaces are active */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == parent_userns_id) + found_parent = true; + if (ns_ids[i] == child_userns_id) + found_child = true; + } + + ASSERT_TRUE(found_parent); + ASSERT_TRUE(found_child); + + /* Wait for child */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify both namespaces become inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_parent = false; + found_child = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == parent_userns_id) + found_parent = true; + if (ns_ids[i] == child_userns_id) + found_child = true; + } + + ASSERT_FALSE(found_parent); + ASSERT_FALSE(found_child); + TH_LOG("Nested user namespace credential changes preserved active refs (no leak)"); +} + +/* + * Test rapid credential changes don't cause refcount imbalances. + * This stress-tests the switch_cred_namespaces() logic. + */ +TEST(rapid_cred_changes_no_leak) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int change_count; + + close(pipefd[0]); + + /* Create new user namespace with wider range of UIDs/GIDs */ + userns_fd = get_userns_fd(0, orig_uid, 100); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* + * Perform many rapid credential changes. + * Mix setuid, setgid, setreuid, setregid, setresuid, setresgid. + */ + for (change_count = 0; change_count < 200; change_count++) { + switch (change_count % 6) { + case 0: + setuid(change_count % 50); + break; + case 1: + setgid(change_count % 50); + break; + case 2: + setreuid(change_count % 50, (change_count + 1) % 50); + break; + case 3: + setregid(change_count % 50, (change_count + 1) % 50); + break; + case 4: + setresuid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); + break; + case 5: + setresgid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); + break; + } + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + TH_LOG("Testing with user namespace ID: %llu", (unsigned long long)userns_id); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive (no leaked active refs) */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("200 rapid credential changes completed with no active ref leak"); +} + +/* + * Test setfsuid/setfsgid which change filesystem UID/GID. + * These also trigger credential changes but may have different code paths. + */ +TEST(setfsuid_preserves_active_refs) +{ + pid_t pid; + int status; + __u64 userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + ssize_t ret; + int i; + bool found = false; + int pipefd[2]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + int fd, userns_fd; + __u64 child_userns_id; + uid_t orig_uid = getuid(); + int change_count; + + close(pipefd[0]); + + /* Create new user namespace */ + userns_fd = get_userns_fd(0, orig_uid, 10); + if (userns_fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + close(pipefd[1]); + exit(1); + } + close(userns_fd); + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); + + /* Perform multiple setfsuid/setfsgid calls */ + for (change_count = 0; change_count < 50; change_count++) { + setfsuid(change_count % 10); + setfsgid(change_count % 10); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { + close(pipefd[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace ID from child"); + } + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Verify namespace becomes inactive */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == userns_id) { + found = true; + break; + } + } + + ASSERT_FALSE(found); + TH_LOG("setfsuid/setfsgid correctly preserved active references (no leak)"); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c new file mode 100644 index 000000000000..c7ed4023d7a8 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_efault_test.c @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "../pidfd/pidfd.h" +#include "wrappers.h" + +/* + * Test listns() error handling with invalid buffer addresses. + * + * When the buffer pointer is invalid (e.g., crossing page boundaries + * into unmapped memory), listns() returns EINVAL. + * + * This test also creates mount namespaces that get destroyed during + * iteration, testing that namespace cleanup happens outside the RCU + * read lock. + */ +TEST(listns_partial_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[5]; + int sv[5][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* + * Map two pages: + * - First page: readable and writable + * - Second page: will be unmapped to trigger EFAULT + */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position the buffer pointer so there's room for exactly one u64 + * before the page boundary. The second u64 would fall into the + * unmapped page. + */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 1; + + /* + * Create a separate process to run listns() in a loop concurrently + * with namespace creation and destruction. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Global listing */ + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * The kernel should: + * 1. Successfully write the first namespace ID (within valid page) + * 2. Fail with EFAULT when trying to write the second ID (unmapped page) + * 3. Handle concurrent namespace destruction without deadlock + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 2, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create several child processes, each in its own mount namespace. + * These will be destroyed while the iterator is running listns(). + */ + for (i = 0; i < 5; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Child: create a couple of tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 5; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* + * Signal children to exit. This will destroy their mount namespaces + * while listns() is iterating the namespace tree. + * This tests that cleanup happens outside the RCU read lock. + */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all mount namespace children to exit and cleanup */ + for (i = 0; i < 5; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test listns() error handling when the entire buffer is invalid. + * This is a sanity check that basic invalid pointer detection works. + */ +TEST(listns_complete_fault) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 *ns_ids; + ssize_t ret; + + /* Use a clearly invalid pointer */ + ns_ids = (__u64 *)0xdeadbeef; + + ret = sys_listns(&req, ns_ids, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() error handling when the buffer is NULL. + */ +TEST(listns_null_buffer) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + ssize_t ret; + + /* NULL buffer with non-zero count should fail */ + ret = sys_listns(&req, NULL, 10, 0); + + if (ret == -1 && errno == ENOSYS) + SKIP(return, "listns() not supported"); + + /* Should fail with EFAULT */ + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EFAULT); +} + +/* + * Test listns() with a buffer that becomes invalid mid-iteration + * (after several successful writes), combined with mount namespace + * destruction to test RCU cleanup logic. + */ +TEST(listns_late_fault_with_ns_cleanup) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[10]; + int sv[10][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Map two pages */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Unmap the second page */ + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* + * Position buffer so we can write several u64s successfully + * before hitting the page boundary. + */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 5; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Request 10 namespace IDs while namespaces are being destroyed. + * This tests: + * 1. EFAULT handling when buffer becomes invalid + * 2. Namespace cleanup outside RCU read lock during iteration + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* + * Create more children with mount namespaces to increase the + * likelihood that namespace cleanup happens during iteration. + */ + for (i = 0; i < 10; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Child: create tmpfs mounts */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 10; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill half the children */ + for (i = 0; i < 5; i++) + write_nointr(sv[i][0], "X", 1); + + /* Small delay to let some exit */ + usleep(10000); + + /* Kill remaining children */ + for (i = 5; i < 10; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for all children and cleanup */ + for (i = 0; i < 10; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + /* Clean up */ + munmap(map, page_size); +} + +/* + * Test specifically focused on mount namespace cleanup during EFAULT. + * Filter for mount namespaces only. + */ +TEST(listns_mnt_ns_cleanup_on_fault) +{ + void *map; + __u64 *ns_ids; + ssize_t ret; + long page_size; + pid_t pid, iter_pid; + int pidfds[8]; + int sv[8][2]; + int iter_pidfd; + int i, status; + char c; + + page_size = sysconf(_SC_PAGESIZE); + ASSERT_GT(page_size, 0); + + /* Set up partial fault buffer */ + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + ret = munmap((char *)map + page_size, page_size); + ASSERT_EQ(ret, 0); + + /* Position for 3 successful writes, then fault */ + ns_ids = ((__u64 *)((char *)map + page_size)) - 3; + + /* + * Create a separate process to run listns() concurrently. + */ + iter_pid = create_child(&iter_pidfd, 0); + ASSERT_NE(iter_pid, -1); + + if (iter_pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNS, /* Only mount namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + int iter_ret; + + /* + * Loop calling listns() until killed. + * Call listns() to race with namespace destruction. + */ + while (1) { + iter_ret = sys_listns(&req, ns_ids, 10, 0); + + if (iter_ret == -1 && errno == ENOSYS) + _exit(PIDFD_SKIP); + } + } + + /* Small delay to let iterator start looping */ + usleep(50000); + + /* Create children with mount namespaces */ + for (i = 0; i < 8; i++) { + /* Create socketpair for synchronization */ + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); + + pid = create_child(&pidfds[i], CLONE_NEWNS); + ASSERT_NE(pid, -1); + + if (pid == 0) { + close(sv[i][0]); /* Close parent end */ + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) + _exit(1); + + /* Do some mount operations to make cleanup more interesting */ + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) + _exit(1); + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) + _exit(1); + + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) + _exit(1); + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) + _exit(1); + + /* Signal parent that setup is complete */ + if (write_nointr(sv[i][1], "R", 1) != 1) + _exit(1); + + /* Wait for parent to signal us to exit */ + if (read_nointr(sv[i][1], &c, 1) != 1) + _exit(1); + + close(sv[i][1]); + _exit(0); + } + + close(sv[i][1]); /* Close child end */ + } + + /* Wait for all children to finish setup */ + for (i = 0; i < 8; i++) { + ret = read_nointr(sv[i][0], &c, 1); + ASSERT_EQ(ret, 1); + ASSERT_EQ(c, 'R'); + } + + /* Kill children to trigger namespace destruction during iteration */ + for (i = 0; i < 8; i++) + write_nointr(sv[i][0], "X", 1); + + /* Wait for children and cleanup */ + for (i = 0; i < 8; i++) { + waitpid(-1, NULL, 0); + close(sv[i][0]); + close(pidfds[i]); + } + + /* Kill iterator and wait for it */ + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); + ret = waitpid(iter_pid, &status, 0); + ASSERT_EQ(ret, iter_pid); + close(iter_pidfd); + + /* Should have been killed */ + ASSERT_TRUE(WIFSIGNALED(status)); + ASSERT_EQ(WTERMSIG(status), SIGKILL); + + munmap(map, page_size); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_pagination_bug.c b/tools/testing/selftests/namespaces/listns_pagination_bug.c new file mode 100644 index 000000000000..da7d33f96397 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_pagination_bug.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/socket.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Minimal test case to reproduce KASAN out-of-bounds in listns pagination. + * + * The bug occurs when: + * 1. Filtering by a specific namespace type (e.g., CLONE_NEWUSER) + * 2. Using pagination (req.ns_id != 0) + * 3. The lookup_ns_id_at() call in do_listns() passes ns_type=0 instead of + * the filtered type, causing it to search the unified tree and potentially + * return a namespace of the wrong type. + */ +TEST(pagination_with_type_filter) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, /* Filter by user namespace */ + .spare2 = 0, + .user_ns_id = 0, + }; + pid_t pids[10]; + int num_children = 10; + int i; + int sv[2]; + __u64 first_batch[3]; + ssize_t ret; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create children with user namespaces */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + char c; + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* First batch - this should work */ + ret = sys_listns(&req, first_batch, 3, 0); + if (ret < 0) { + if (errno == ENOSYS) { + close(sv[0]); + for (i = 0; i < num_children; i++) + kill(pids[i], SIGKILL); + for (i = 0; i < num_children; i++) + waitpid(pids[i], NULL, 0); + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + TH_LOG("First batch returned %zd entries", ret); + + if (ret == 3) { + __u64 second_batch[3]; + + /* Second batch - pagination triggers the bug */ + req.ns_id = first_batch[2]; /* Continue from last ID */ + ret = sys_listns(&req, second_batch, 3, 0); + + TH_LOG("Second batch returned %zd entries", ret); + ASSERT_GE(ret, 0); + } + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Cleanup */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_permissions_test.c b/tools/testing/selftests/namespaces/listns_permissions_test.c new file mode 100644 index 000000000000..82d818751a5f --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_permissions_test.c @@ -0,0 +1,759 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/capability.h> +#include <sys/ioctl.h> +#include <sys/prctl.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test that unprivileged users can only see namespaces they're currently in. + * Create a namespace, drop privileges, verify we can only see our own namespaces. + */ +TEST(listns_unprivileged_current_only) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + int pipefd[2]; + pid_t pid; + int status; + bool found_ours; + int unexpected_count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 our_netns_id; + bool found_ours; + int unexpected_count; + + close(pipefd[0]); + + /* Create user namespace to be unprivileged */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Create a network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get our network namespace ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &our_netns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Now we're unprivileged - list all network namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* We should only see our own network namespace */ + found_ours = false; + unexpected_count = 0; + + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == our_netns_id) { + found_ours = true; + } else { + /* This is either init_net (which we can see) or unexpected */ + unexpected_count++; + } + } + + /* Send results to parent */ + write(pipefd[1], &found_ours, sizeof(found_ours)); + write(pipefd[1], &unexpected_count, sizeof(unexpected_count)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_ours = false; + unexpected_count = 0; + read(pipefd[0], &found_ours, sizeof(found_ours)); + read(pipefd[0], &unexpected_count, sizeof(unexpected_count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Child should have seen its own namespace */ + ASSERT_TRUE(found_ours); + + TH_LOG("Unprivileged child saw its own namespace, plus %d others (likely init_net)", + unexpected_count); +} + +/* + * Test that users with CAP_SYS_ADMIN in a user namespace can see + * all namespaces owned by that user namespace. + */ +TEST(listns_cap_sys_admin_in_userns) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Will be set to our created user namespace */ + }; + __u64 ns_ids[100]; + int pipefd[2]; + pid_t pid; + int status; + bool success; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 userns_id; + ssize_t ret; + int min_expected; + bool success; + + close(pipefd[0]); + + /* Create user namespace - we'll have CAP_SYS_ADMIN in it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get the user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create several namespaces owned by this user namespace */ + unshare(CLONE_NEWNET); + unshare(CLONE_NEWUTS); + unshare(CLONE_NEWIPC); + + /* List namespaces owned by our user namespace */ + req.user_ns_id = userns_id; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* + * We have CAP_SYS_ADMIN in this user namespace, + * so we should see all namespaces owned by it. + * That includes: net, uts, ipc, and the user namespace itself. + */ + min_expected = 4; + success = (ret >= min_expected); + + write(pipefd[1], &success, sizeof(success)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + success = false; + count = 0; + read(pipefd[0], &success, sizeof(success)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(success); + TH_LOG("User with CAP_SYS_ADMIN saw %zd namespaces owned by their user namespace", + count); +} + +/* + * Test that users cannot see namespaces from unrelated user namespaces. + * Create two sibling user namespaces, verify they can't see each other's + * owned namespaces. + */ +TEST(listns_cannot_see_sibling_userns_namespaces) +{ + int pipefd[2]; + pid_t pid1, pid2; + int status; + __u64 netns_a_id; + int pipefd2[2]; + bool found_sibling_netns; + + ASSERT_EQ(pipe(pipefd), 0); + + /* Fork first child - creates user namespace A */ + pid1 = fork(); + ASSERT_GE(pid1, 0); + + if (pid1 == 0) { + int fd; + __u64 netns_a_id; + char buf; + + close(pipefd[0]); + + /* Create user namespace A */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Create network namespace owned by user namespace A */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get network namespace ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &netns_a_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send namespace ID to parent */ + write(pipefd[1], &netns_a_id, sizeof(netns_a_id)); + + /* Keep alive for sibling to check */ + read(pipefd[1], &buf, 1); + close(pipefd[1]); + exit(0); + } + + /* Parent reads namespace A ID */ + close(pipefd[1]); + netns_a_id = 0; + read(pipefd[0], &netns_a_id, sizeof(netns_a_id)); + + TH_LOG("User namespace A created network namespace with ID %llu", + (unsigned long long)netns_a_id); + + /* Fork second child - creates user namespace B */ + ASSERT_EQ(pipe(pipefd2), 0); + + pid2 = fork(); + ASSERT_GE(pid2, 0); + + if (pid2 == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + bool found_sibling_netns; + + close(pipefd[0]); + close(pipefd2[0]); + + /* Create user namespace B (sibling to A) */ + if (setup_userns() < 0) { + close(pipefd2[1]); + exit(1); + } + + /* Try to list all network namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + found_sibling_netns = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == netns_a_id) { + found_sibling_netns = true; + break; + } + } + } + + /* We should NOT see the sibling's network namespace */ + write(pipefd2[1], &found_sibling_netns, sizeof(found_sibling_netns)); + close(pipefd2[1]); + exit(0); + } + + /* Parent reads result from second child */ + close(pipefd2[1]); + found_sibling_netns = false; + read(pipefd2[0], &found_sibling_netns, sizeof(found_sibling_netns)); + close(pipefd2[0]); + + /* Signal first child to exit */ + close(pipefd[0]); + + /* Wait for both children */ + waitpid(pid2, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + waitpid(pid1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* Second child should NOT have seen first child's namespace */ + ASSERT_FALSE(found_sibling_netns); + TH_LOG("User namespace B correctly could not see sibling namespace A's network namespace"); +} + +/* + * Test permission checking with LISTNS_CURRENT_USER. + * Verify that listing with LISTNS_CURRENT_USER respects permissions. + */ +TEST(listns_current_user_permissions) +{ + int pipefd[2]; + pid_t pid; + int status; + bool success; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids[100]; + ssize_t ret; + bool success; + + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Create some namespaces owned by this user namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + if (unshare(CLONE_NEWUTS) < 0) { + close(pipefd[1]); + exit(1); + } + + /* List with LISTNS_CURRENT_USER - should see our owned namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + success = (ret >= 3); /* At least user, net, uts */ + write(pipefd[1], &success, sizeof(success)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + success = false; + count = 0; + read(pipefd[0], &success, sizeof(success)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(success); + TH_LOG("LISTNS_CURRENT_USER returned %zd namespaces", count); +} + +/* + * Test that CAP_SYS_ADMIN in parent user namespace allows seeing + * child user namespace's owned namespaces. + */ +TEST(listns_parent_userns_cap_sys_admin) +{ + int pipefd[2]; + pid_t pid; + int status; + bool found_child_userns; + ssize_t count; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 parent_userns_id; + __u64 child_userns_id; + struct ns_id_req req; + __u64 ns_ids[100]; + ssize_t ret; + bool found_child_userns; + + close(pipefd[0]); + + /* Create parent user namespace - we have CAP_SYS_ADMIN in it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get parent user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get child user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Create namespaces owned by child user namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* List namespaces owned by parent user namespace */ + req.size = sizeof(req); + req.spare = 0; + req.ns_id = 0; + req.ns_type = 0; + req.spare2 = 0; + req.user_ns_id = parent_userns_id; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + /* Should see child user namespace in the list */ + found_child_userns = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == child_userns_id) { + found_child_userns = true; + break; + } + } + } + + write(pipefd[1], &found_child_userns, sizeof(found_child_userns)); + write(pipefd[1], &ret, sizeof(ret)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_child_userns = false; + count = 0; + read(pipefd[0], &found_child_userns, sizeof(found_child_userns)); + read(pipefd[0], &count, sizeof(count)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(found_child_userns); + TH_LOG("Process with CAP_SYS_ADMIN in parent user namespace saw child user namespace (total: %zd)", + count); +} + +/* + * Test that we can see user namespaces we have CAP_SYS_ADMIN inside of. + * This is different from seeing namespaces owned by a user namespace. + */ +TEST(listns_cap_sys_admin_inside_userns) +{ + int pipefd[2]; + pid_t pid; + int status; + bool found_ours; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 our_userns_id; + struct ns_id_req req; + __u64 ns_ids[100]; + ssize_t ret; + bool found_ours; + + close(pipefd[0]); + + /* Create user namespace - we have CAP_SYS_ADMIN inside it */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get our user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &our_userns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* List all user namespaces globally */ + req.size = sizeof(req); + req.spare = 0; + req.ns_id = 0; + req.ns_type = CLONE_NEWUSER; + req.spare2 = 0; + req.user_ns_id = 0; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + /* We should be able to see our own user namespace */ + found_ours = false; + if (ret > 0) { + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == our_userns_id) { + found_ours = true; + break; + } + } + } + + write(pipefd[1], &found_ours, sizeof(found_ours)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + found_ours = false; + read(pipefd[0], &found_ours, sizeof(found_ours)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(found_ours); + TH_LOG("Process can see user namespace it has CAP_SYS_ADMIN inside of"); +} + +/* + * Test that dropping CAP_SYS_ADMIN restricts what we can see. + */ +TEST(listns_drop_cap_sys_admin) +{ + cap_t caps; + cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; + + /* This test needs to start with CAP_SYS_ADMIN */ + caps = cap_get_proc(); + if (!caps) { + SKIP(return, "Cannot get capabilities"); + } + + cap_flag_value_t cap_val; + if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &cap_val) < 0) { + cap_free(caps); + SKIP(return, "Cannot check CAP_SYS_ADMIN"); + } + + if (cap_val != CAP_SET) { + cap_free(caps); + SKIP(return, "Test needs CAP_SYS_ADMIN to start"); + } + cap_free(caps); + + int pipefd[2]; + pid_t pid; + int status; + bool correct; + ssize_t count_before, count_after; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids_before[100]; + ssize_t count_before; + __u64 ns_ids_after[100]; + ssize_t count_after; + bool correct; + + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + /* Count namespaces with CAP_SYS_ADMIN */ + count_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + + /* Drop CAP_SYS_ADMIN */ + caps = cap_get_proc(); + if (caps) { + cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR); + cap_set_flag(caps, CAP_PERMITTED, 1, cap_list, CAP_CLEAR); + cap_set_proc(caps); + cap_free(caps); + } + + /* Ensure we can't regain the capability */ + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + + /* Count namespaces without CAP_SYS_ADMIN */ + count_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + + /* Without CAP_SYS_ADMIN, we should see same or fewer namespaces */ + correct = (count_after <= count_before); + + write(pipefd[1], &correct, sizeof(correct)); + write(pipefd[1], &count_before, sizeof(count_before)); + write(pipefd[1], &count_after, sizeof(count_after)); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + + correct = false; + count_before = 0; + count_after = 0; + read(pipefd[0], &correct, sizeof(correct)); + read(pipefd[0], &count_before, sizeof(count_before)); + read(pipefd[0], &count_after, sizeof(count_after)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_TRUE(correct); + TH_LOG("With CAP_SYS_ADMIN: %zd namespaces, without: %zd namespaces", + count_before, count_after); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/listns_test.c b/tools/testing/selftests/namespaces/listns_test.c new file mode 100644 index 000000000000..8a95789d6a87 --- /dev/null +++ b/tools/testing/selftests/namespaces/listns_test.c @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Test basic listns() functionality with the unified namespace tree. + * List all active namespaces globally. + */ +TEST(listns_basic_unified) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, /* Global listing */ + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + + /* Should find at least the initial namespaces */ + ASSERT_GT(ret, 0); + TH_LOG("Found %zd active namespaces", ret); + + /* Verify all returned IDs are non-zero */ + for (ssize_t i = 0; i < ret; i++) { + ASSERT_NE(ns_ids[i], 0); + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); + } +} + +/* + * Test listns() with type filtering. + * List only network namespaces. + */ +TEST(listns_filter_by_type) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, /* Only network namespaces */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + /* Should find at least init_net */ + ASSERT_GT(ret, 0); + TH_LOG("Found %zd active network namespaces", ret); + + /* Verify we can open each namespace and it's actually a network namespace */ + for (ssize_t i = 0; i < ret && i < 5; i++) { + struct nsfs_file_handle nsfh = { + .ns_id = ns_ids[i], + .ns_type = CLONE_NEWNET, + .ns_inum = 0, + }; + struct file_handle *fh; + int fd; + + fh = (struct file_handle *)malloc(sizeof(*fh) + sizeof(nsfh)); + ASSERT_NE(fh, NULL); + fh->handle_bytes = sizeof(nsfh); + fh->handle_type = 0; + memcpy(fh->f_handle, &nsfh, sizeof(nsfh)); + + fd = open_by_handle_at(-10003, fh, O_RDONLY); + free(fh); + + if (fd >= 0) { + int ns_type; + /* Verify it's a network namespace via ioctl */ + ns_type = ioctl(fd, NS_GET_NSTYPE); + if (ns_type >= 0) { + ASSERT_EQ(ns_type, CLONE_NEWNET); + } + close(fd); + } + } +} + +/* + * Test listns() pagination. + * List namespaces in batches. + */ +TEST(listns_pagination) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 batch1[2], batch2[2]; + ssize_t ret1, ret2; + + /* Get first batch */ + ret1 = sys_listns(&req, batch1, ARRAY_SIZE(batch1), 0); + if (ret1 < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret1, 0); + + if (ret1 == 0) + SKIP(return, "No namespaces found"); + + TH_LOG("First batch: %zd namespaces", ret1); + + /* Get second batch using last ID from first batch */ + if (ret1 == ARRAY_SIZE(batch1)) { + req.ns_id = batch1[ret1 - 1]; + ret2 = sys_listns(&req, batch2, ARRAY_SIZE(batch2), 0); + ASSERT_GE(ret2, 0); + + TH_LOG("Second batch: %zd namespaces (after ns_id=%llu)", + ret2, (unsigned long long)req.ns_id); + + /* If we got more results, verify IDs are monotonically increasing */ + if (ret2 > 0) { + ASSERT_GT(batch2[0], batch1[ret1 - 1]); + TH_LOG("Pagination working: %llu > %llu", + (unsigned long long)batch2[0], + (unsigned long long)batch1[ret1 - 1]); + } + } else { + TH_LOG("All namespaces fit in first batch"); + } +} + +/* + * Test listns() with LISTNS_CURRENT_USER. + * List namespaces owned by current user namespace. + */ +TEST(listns_current_user) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = LISTNS_CURRENT_USER, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + /* Should find at least the initial namespaces if we're in init_user_ns */ + TH_LOG("Found %zd namespaces owned by current user namespace", ret); + + for (ssize_t i = 0; i < ret; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); +} + +/* + * Test that listns() only returns active namespaces. + * Create a namespace, let it become inactive, verify it's not listed. + */ +TEST(listns_only_active) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[100], ns_ids_after[100]; + ssize_t ret_before, ret_after; + int pipefd[2]; + pid_t pid; + __u64 new_ns_id = 0; + int status; + + /* Get initial list */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret_before, 0); + + TH_LOG("Before: %zd active network namespaces", ret_before); + + /* Create a new namespace in a child process and get its ID */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 ns_id; + + close(pipefd[0]); + + /* Create new network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get its ID */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { + close(fd); + close(pipefd[1]); + exit(1); + } + close(fd); + + /* Send ID to parent */ + write(pipefd[1], &ns_id, sizeof(ns_id)); + close(pipefd[1]); + + /* Keep namespace active briefly */ + usleep(100000); + exit(0); + } + + /* Parent reads the new namespace ID */ + { + int bytes; + + close(pipefd[1]); + bytes = read(pipefd[0], &new_ns_id, sizeof(new_ns_id)); + close(pipefd[0]); + + if (bytes == sizeof(new_ns_id)) { + __u64 ns_ids_during[100]; + int ret_during; + + TH_LOG("Child created namespace with ID %llu", (unsigned long long)new_ns_id); + + /* List namespaces while child is still alive - should see new one */ + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); + ASSERT_GE(ret_during, 0); + TH_LOG("During: %d active network namespaces", ret_during); + + /* Should have more namespaces than before */ + ASSERT_GE(ret_during, ret_before); + } + } + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + + /* Give time for namespace to become inactive */ + usleep(100000); + + /* List namespaces after child exits - should not see new one */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + TH_LOG("After: %zd active network namespaces", ret_after); + + /* Verify the new namespace ID is not in the after list */ + if (new_ns_id != 0) { + bool found = false; + + for (ssize_t i = 0; i < ret_after; i++) { + if (ns_ids_after[i] == new_ns_id) { + found = true; + break; + } + } + ASSERT_FALSE(found); + } +} + +/* + * Test listns() with specific user namespace ID. + * Create a user namespace and list namespaces it owns. + */ +TEST(listns_specific_userns) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, /* Will be filled with created userns ID */ + }; + __u64 ns_ids[100]; + int sv[2]; + pid_t pid; + int status; + __u64 user_ns_id = 0; + int bytes; + ssize_t ret; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + __u64 ns_id; + char buf; + + close(sv[0]); + + /* Create new user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Get user namespace ID */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Send ID to parent */ + if (write(sv[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) { + close(sv[1]); + exit(1); + } + + /* Create some namespaces owned by this user namespace */ + unshare(CLONE_NEWNET); + unshare(CLONE_NEWUTS); + + /* Wait for parent signal */ + if (read(sv[1], &buf, 1) != 1) { + close(sv[1]); + exit(1); + } + close(sv[1]); + exit(0); + } + + /* Parent */ + close(sv[1]); + bytes = read(sv[0], &user_ns_id, sizeof(user_ns_id)); + + if (bytes != sizeof(user_ns_id)) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get user namespace ID from child"); + } + + TH_LOG("Child created user namespace with ID %llu", (unsigned long long)user_ns_id); + + /* List namespaces owned by this user namespace */ + req.user_ns_id = user_ns_id; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + if (ret < 0) { + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + if (errno == ENOSYS) { + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + TH_LOG("Found %zd namespaces owned by user namespace %llu", ret, + (unsigned long long)user_ns_id); + + /* Should find at least the network and UTS namespaces we created */ + if (ret > 0) { + for (ssize_t i = 0; i < ret && i < 10; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); + } + + /* Signal child to exit */ + if (write(sv[0], "X", 1) != 1) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + ASSERT_TRUE(false); + } + close(sv[0]); + waitpid(pid, &status, 0); +} + +/* + * Test listns() with multiple namespace types filter. + */ +TEST(listns_multiple_types) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUTS, /* Network and UTS */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[100]; + ssize_t ret; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(false); + } + ASSERT_GE(ret, 0); + + TH_LOG("Found %zd active network/UTS namespaces", ret); + + for (ssize_t i = 0; i < ret; i++) + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); +} + +/* + * Test that hierarchical active reference propagation keeps parent + * user namespaces visible in listns(). + */ +TEST(listns_hierarchical_visibility) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 parent_ns_id = 0, child_ns_id = 0; + int sv[2]; + pid_t pid; + int status; + int bytes; + __u64 ns_ids[100]; + ssize_t ret; + bool found_parent, found_child; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + char buf; + + close(sv[0]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &parent_ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(sv[1]); + exit(1); + } + + if (ioctl(fd, NS_GET_ID, &child_ns_id) < 0) { + close(fd); + close(sv[1]); + exit(1); + } + close(fd); + + /* Send both IDs to parent */ + if (write(sv[1], &parent_ns_id, sizeof(parent_ns_id)) != sizeof(parent_ns_id)) { + close(sv[1]); + exit(1); + } + if (write(sv[1], &child_ns_id, sizeof(child_ns_id)) != sizeof(child_ns_id)) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal */ + if (read(sv[1], &buf, 1) != 1) { + close(sv[1]); + exit(1); + } + close(sv[1]); + exit(0); + } + + /* Parent */ + close(sv[1]); + + /* Read both namespace IDs */ + bytes = read(sv[0], &parent_ns_id, sizeof(parent_ns_id)); + bytes += read(sv[0], &child_ns_id, sizeof(child_ns_id)); + + if (bytes != (int)(2 * sizeof(__u64))) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to get namespace IDs from child"); + } + + TH_LOG("Parent user namespace ID: %llu", (unsigned long long)parent_ns_id); + TH_LOG("Child user namespace ID: %llu", (unsigned long long)child_ns_id); + + /* List all user namespaces */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + + if (ret < 0 && errno == ENOSYS) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "listns() not supported"); + } + + ASSERT_GE(ret, 0); + TH_LOG("Found %zd active user namespaces", ret); + + /* Both parent and child should be visible (active due to child process) */ + found_parent = false; + found_child = false; + for (ssize_t i = 0; i < ret; i++) { + if (ns_ids[i] == parent_ns_id) + found_parent = true; + if (ns_ids[i] == child_ns_id) + found_child = true; + } + + TH_LOG("Parent namespace %s, child namespace %s", + found_parent ? "found" : "NOT FOUND", + found_child ? "found" : "NOT FOUND"); + + ASSERT_TRUE(found_child); + /* With hierarchical propagation, parent should also be active */ + ASSERT_TRUE(found_parent); + + /* Signal child to exit */ + if (write(sv[0], "X", 1) != 1) { + close(sv[0]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + ASSERT_TRUE(false); + } + close(sv[0]); + waitpid(pid, &status, 0); +} + +/* + * Test error cases for listns(). + */ +TEST(listns_error_cases) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[10]; + int ret; + + /* Test with invalid flags */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0xFFFF); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); + } + + /* Test with NULL ns_ids array */ + ret = sys_listns(&req, NULL, 10, 0); + ASSERT_LT(ret, 0); + + /* Test with invalid spare field */ + req.spare = 1; + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + ASSERT_EQ(errno, EINVAL); + } + req.spare = 0; + + /* Test with huge nr_ns_ids */ + ret = sys_listns(&req, ns_ids, 2000000, 0); + if (errno == ENOSYS) { + /* listns() not supported, skip this check */ + } else { + ASSERT_LT(ret, 0); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/ns_active_ref_test.c b/tools/testing/selftests/namespaces/ns_active_ref_test.c new file mode 100644 index 000000000000..093268f0efaa --- /dev/null +++ b/tools/testing/selftests/namespaces/ns_active_ref_test.c @@ -0,0 +1,2672 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/nsfs.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <pthread.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ +#endif + +#ifndef FILEID_NSFS +#define FILEID_NSFS 0xf1 +#endif + +/* + * Test that initial namespaces can be reopened via file handle. + * Initial namespaces should have active ref count of 1 from boot. + */ +TEST(init_ns_always_active) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd1, fd2; + struct stat st1, st2; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open initial network namespace */ + fd1 = open("/proc/1/ns/net", O_RDONLY); + ASSERT_GE(fd1, 0); + + /* Get file handle for initial namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd1, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(fd1); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + + /* Close the namespace fd */ + close(fd1); + + /* Try to reopen via file handle - should succeed since init ns is always active */ + fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd2 < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); + return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd2, 0); + + /* Verify we opened the same namespace */ + fd1 = open("/proc/1/ns/net", O_RDONLY); + ASSERT_GE(fd1, 0); + ASSERT_EQ(fstat(fd1, &st1), 0); + ASSERT_EQ(fstat(fd2, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + close(fd1); + close(fd2); + free(handle); +} + +/* + * Test namespace lifecycle: create a namespace in a child process, + * get a file handle while it's active, then try to reopen after + * the process exits (namespace becomes inactive). + */ +TEST(ns_inactive_after_exit) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + /* Create pipe for passing file handle from child */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Open our new namespace */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + /* Get file handle for the namespace */ + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + + /* Exit - namespace should become inactive */ + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + /* Read file handle from child */ + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Try to reopen namespace - should fail with ENOENT since it's inactive */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + /* Should fail with ENOENT (namespace inactive) or ESTALE */ + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that a namespace remains active while a process is using it, + * even after the creating process exits. + */ +TEST(ns_active_with_multiple_processes) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + int syncpipe[2]; + pid_t pid1, pid2; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char sync_byte; + + /* Create pipes for communication */ + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + pid1 = fork(); + ASSERT_GE(pid1, 0); + + if (pid1 == 0) { + /* First child - creates namespace */ + close(pipefd[0]); + close(syncpipe[1]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + /* Open and get handle */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + + /* Wait for signal before exiting */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + /* Parent reads handle */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + ASSERT_GT(ret, 0); + + handle = (struct file_handle *)buf; + + /* Create second child that will keep namespace active */ + pid2 = fork(); + ASSERT_GE(pid2, 0); + + if (pid2 == 0) { + /* Second child - reopens the namespace */ + close(syncpipe[0]); + close(syncpipe[1]); + + /* Open the namespace via handle */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0) { + exit(1); + } + + /* Join the namespace */ + ret = setns(fd, CLONE_NEWNET); + close(fd); + if (ret < 0) { + exit(1); + } + + /* Sleep to keep namespace active */ + sleep(1); + exit(0); + } + + /* Let second child enter the namespace */ + usleep(100000); /* 100ms */ + + /* Signal first child to exit */ + close(syncpipe[0]); + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for first child */ + waitpid(pid1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* Namespace should still be active because second child is using it */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(fd, 0); + close(fd); + + /* Wait for second child */ + waitpid(pid2, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); +} + +/* + * Test user namespace active ref tracking via credential lifecycle + */ +TEST(userns_active_ref_lifecycle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new user namespace */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Set up uid/gid mappings */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd >= 0 && gid_map_fd >= 0 && setgroups_fd >= 0) { + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + } + + /* Get file handle */ + fd = open("/proc/self/ns/user", O_RDONLY); + if (fd < 0) { + close(pipefd[1]); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Namespace should be inactive after all tasks exit */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test PID namespace active ref tracking + */ +TEST(pidns_active_ref_lifecycle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new PID namespace */ + ret = unshare(CLONE_NEWPID); + if (ret < 0) { + close(pipefd[1]); + exit(1); + } + + /* Fork to actually enter the PID namespace */ + pid_t child = fork(); + if (child < 0) { + close(pipefd[1]); + exit(1); + } + + if (child == 0) { + /* Grandchild - in new PID namespace */ + fd = open("/proc/self/ns/pid", O_RDONLY); + if (fd < 0) { + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + exit(1); + } + + /* Send handle to grandparent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child, NULL, 0); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* Namespace should be inactive after all processes exit */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that an open file descriptor keeps a namespace active. + * Even after the creating process exits, the namespace should remain + * active as long as an fd is held open. + */ +TEST(ns_fd_keeps_active) +{ + struct file_handle *handle; + int mount_id; + int ret; + int nsfd; + int pipe_child_ready[2]; + int pipe_parent_ready[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char sync_byte; + char proc_path[64]; + + ASSERT_EQ(pipe(pipe_child_ready), 0); + ASSERT_EQ(pipe(pipe_parent_ready), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipe_child_ready[0]); + close(pipe_parent_ready[1]); + + TH_LOG("Child: creating new network namespace"); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + TH_LOG("Child: unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: network namespace created successfully"); + + /* Get file handle for the namespace */ + nsfd = open("/proc/self/ns/net", O_RDONLY); + if (nsfd < 0) { + TH_LOG("Child: failed to open /proc/self/ns/net: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: opened namespace fd %d", nsfd); + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(nsfd, "", handle, &mount_id, AT_EMPTY_PATH); + close(nsfd); + + if (ret < 0) { + TH_LOG("Child: name_to_handle_at failed: %s", strerror(errno)); + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + exit(1); + } + + TH_LOG("Child: got file handle (bytes=%u)", handle->handle_bytes); + + /* Send file handle to parent */ + ret = write(pipe_child_ready[1], buf, sizeof(*handle) + handle->handle_bytes); + TH_LOG("Child: sent %d bytes of file handle to parent", ret); + close(pipe_child_ready[1]); + + /* Wait for parent to open the fd */ + TH_LOG("Child: waiting for parent to open fd"); + ret = read(pipe_parent_ready[0], &sync_byte, 1); + close(pipe_parent_ready[0]); + + TH_LOG("Child: parent signaled (read %d bytes), exiting now", ret); + /* Exit - namespace should stay active because parent holds fd */ + exit(0); + } + + /* Parent process */ + close(pipe_child_ready[1]); + close(pipe_parent_ready[0]); + + TH_LOG("Parent: reading file handle from child"); + + /* Read file handle from child */ + ret = read(pipe_child_ready[0], buf, sizeof(buf)); + close(pipe_child_ready[0]); + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + TH_LOG("Parent: received %d bytes, handle size=%u", ret, handle->handle_bytes); + + /* Open the child's namespace while it's still alive */ + snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid); + TH_LOG("Parent: opening child's namespace at %s", proc_path); + nsfd = open(proc_path, O_RDONLY); + if (nsfd < 0) { + TH_LOG("Parent: failed to open %s: %s", proc_path, strerror(errno)); + close(pipe_parent_ready[1]); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child's namespace"); + } + + TH_LOG("Parent: opened child's namespace, got fd %d", nsfd); + + /* Signal child that we have the fd */ + sync_byte = 'G'; + write(pipe_parent_ready[1], &sync_byte, 1); + close(pipe_parent_ready[1]); + TH_LOG("Parent: signaled child that we have the fd"); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + TH_LOG("Child exited, parent holds fd %d to namespace", nsfd); + + /* + * Namespace should still be ACTIVE because we hold an fd. + * We should be able to reopen it via file handle. + */ + TH_LOG("Attempting to reopen namespace via file handle (should succeed - fd held)"); + int fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(fd2, 0); + + TH_LOG("Successfully reopened namespace via file handle, got fd %d", fd2); + + /* Verify it's the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(nsfd, &st1), 0); + ASSERT_EQ(fstat(fd2, &st2), 0); + TH_LOG("Namespace inodes: nsfd=%lu, fd2=%lu", st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_ino, st2.st_ino); + close(fd2); + + /* Now close the fd - namespace should become inactive */ + TH_LOG("Closing fd %d - namespace should become inactive", nsfd); + close(nsfd); + + /* Now reopening should fail - namespace is inactive */ + TH_LOG("Attempting to reopen namespace via file handle (should fail - inactive)"); + fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd2, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Reopen failed as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test hierarchical active reference propagation. + * When a child namespace is active, its owning user namespace should also + * be active automatically due to hierarchical active reference propagation. + * This ensures parents are always reachable when children are active. + */ +TEST(ns_parent_always_reachable) +{ + struct file_handle *parent_handle, *child_handle; + int ret; + int child_nsfd; + int pipefd[2]; + pid_t pid; + int status; + __u64 parent_id, child_id; + char parent_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ]; + char child_buf[sizeof(*child_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + TH_LOG("Child: creating parent user namespace and setting up mappings"); + + /* Create parent user namespace with mappings */ + ret = setup_userns(); + if (ret < 0) { + TH_LOG("Child: setup_userns() for parent failed: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: parent user namespace created, now uid=%d gid=%d", getuid(), getgid()); + + /* Get namespace ID for parent user namespace */ + int parent_fd = open("/proc/self/ns/user", O_RDONLY); + if (parent_fd < 0) { + TH_LOG("Child: failed to open parent /proc/self/ns/user: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: opened parent userns fd %d", parent_fd); + + if (ioctl(parent_fd, NS_GET_ID, &parent_id) < 0) { + TH_LOG("Child: NS_GET_ID for parent failed: %s", strerror(errno)); + close(parent_fd); + close(pipefd[1]); + exit(1); + } + close(parent_fd); + + TH_LOG("Child: got parent namespace ID %llu", (unsigned long long)parent_id); + + /* Create child user namespace within parent */ + TH_LOG("Child: creating nested child user namespace"); + ret = setup_userns(); + if (ret < 0) { + TH_LOG("Child: setup_userns() for child failed: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: nested child user namespace created, uid=%d gid=%d", getuid(), getgid()); + + /* Get namespace ID for child user namespace */ + int child_fd = open("/proc/self/ns/user", O_RDONLY); + if (child_fd < 0) { + TH_LOG("Child: failed to open child /proc/self/ns/user: %s", strerror(errno)); + close(pipefd[1]); + exit(1); + } + + TH_LOG("Child: opened child userns fd %d", child_fd); + + if (ioctl(child_fd, NS_GET_ID, &child_id) < 0) { + TH_LOG("Child: NS_GET_ID for child failed: %s", strerror(errno)); + close(child_fd); + close(pipefd[1]); + exit(1); + } + close(child_fd); + + TH_LOG("Child: got child namespace ID %llu", (unsigned long long)child_id); + + /* Send both namespace IDs to parent */ + TH_LOG("Child: sending both namespace IDs to parent"); + write(pipefd[1], &parent_id, sizeof(parent_id)); + write(pipefd[1], &child_id, sizeof(child_id)); + close(pipefd[1]); + + TH_LOG("Child: exiting - parent userns should become inactive"); + /* Exit - parent user namespace should become inactive */ + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + TH_LOG("Parent: reading both namespace IDs from child"); + + /* Read both namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &parent_id, sizeof(parent_id)); + if (ret != sizeof(parent_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID from child"); + } + + ret = read(pipefd[0], &child_id, sizeof(child_id)); + close(pipefd[0]); + if (ret != sizeof(child_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read child namespace ID from child"); + } + + TH_LOG("Parent: received parent_id=%llu, child_id=%llu", + (unsigned long long)parent_id, (unsigned long long)child_id); + + /* Construct file handles from namespace IDs */ + parent_handle = (struct file_handle *)parent_buf; + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); + parent_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *parent_fh = (struct nsfs_file_handle *)parent_handle->f_handle; + parent_fh->ns_id = parent_id; + parent_fh->ns_type = 0; + parent_fh->ns_inum = 0; + + child_handle = (struct file_handle *)child_buf; + child_handle->handle_bytes = sizeof(struct nsfs_file_handle); + child_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *child_fh = (struct nsfs_file_handle *)child_handle->f_handle; + child_fh->ns_id = child_id; + child_fh->ns_type = 0; + child_fh->ns_inum = 0; + + TH_LOG("Parent: opening child namespace BEFORE child exits"); + + /* Open child namespace while child is still alive to keep it active */ + child_nsfd = open_by_handle_at(FD_NSFS_ROOT, child_handle, O_RDONLY); + if (child_nsfd < 0) { + TH_LOG("Failed to open child namespace: %s (errno=%d)", strerror(errno), errno); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespace"); + } + + TH_LOG("Opened child namespace fd %d", child_nsfd); + + /* Now wait for child to exit */ + TH_LOG("Parent: waiting for child to exit"); + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + TH_LOG("Child process exited, parent holds fd to child namespace"); + + /* + * With hierarchical active reference propagation: + * Since the child namespace is active (parent process holds fd), + * the parent user namespace should ALSO be active automatically. + * This is because when we took an active reference on the child, + * it propagated up to the owning user namespace. + */ + TH_LOG("Attempting to reopen parent namespace (should SUCCEED - hierarchical propagation)"); + int parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(parent_fd, 0); + + TH_LOG("SUCCESS: Parent namespace is active (fd=%d) due to active child", parent_fd); + + /* Verify we can also get parent via NS_GET_USERNS */ + TH_LOG("Verifying NS_GET_USERNS also works"); + int parent_fd2 = ioctl(child_nsfd, NS_GET_USERNS); + if (parent_fd2 < 0) { + close(parent_fd); + close(child_nsfd); + TH_LOG("NS_GET_USERNS failed: %s (errno=%d)", strerror(errno), errno); + SKIP(return, "NS_GET_USERNS not supported or failed"); + } + + TH_LOG("NS_GET_USERNS succeeded, got parent fd %d", parent_fd2); + + /* Verify both methods give us the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(parent_fd, &st1), 0); + ASSERT_EQ(fstat(parent_fd2, &st2), 0); + TH_LOG("Parent namespace inodes: parent_fd=%lu, parent_fd2=%lu", st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + /* + * Close child fd - parent should remain active because we still + * hold direct references to it (parent_fd and parent_fd2). + */ + TH_LOG("Closing child fd - parent should remain active (direct refs held)"); + close(child_nsfd); + + /* Parent should still be openable */ + TH_LOG("Verifying parent still active via file handle"); + int parent_fd3 = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(parent_fd3, 0); + close(parent_fd3); + + TH_LOG("Closing all fds to parent namespace"); + close(parent_fd); + close(parent_fd2); + + /* Both should now be inactive */ + TH_LOG("Attempting to reopen parent (should fail - inactive, no refs)"); + parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_LT(parent_fd, 0); + TH_LOG("Parent inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that bind mounts keep namespaces in the tree even when inactive + */ +TEST(ns_bind_mount_keeps_in_tree) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int pipefd[2]; + pid_t pid; + int status; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + char tmpfile[] = "/tmp/ns-test-XXXXXX"; + int tmpfd; + + /* Create temporary file for bind mount */ + tmpfd = mkstemp(tmpfile); + if (tmpfd < 0) { + SKIP(return, "Cannot create temporary file"); + } + close(tmpfd); + + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Unshare mount namespace and make mounts private to avoid propagation */ + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + ret = mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Bind mount the namespace */ + ret = mount("/proc/self/ns/net", tmpfile, NULL, MS_BIND, NULL); + if (ret < 0) { + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Get file handle */ + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + umount(tmpfile); + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + handle = (struct file_handle *)buf; + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); + close(fd); + + if (ret < 0) { + umount(tmpfile); + close(pipefd[1]); + unlink(tmpfile); + exit(1); + } + + /* Send handle to parent */ + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); + close(pipefd[1]); + exit(0); + } + + /* Parent */ + close(pipefd[1]); + ret = read(pipefd[0], buf, sizeof(buf)); + close(pipefd[0]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + ASSERT_GT(ret, 0); + handle = (struct file_handle *)buf; + + /* + * Namespace should be inactive but still in tree due to bind mount. + * Reopening should fail with ENOENT (inactive) not ESTALE (not in tree). + */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(fd, 0); + /* Should be ENOENT (inactive) since bind mount keeps it in tree */ + if (errno != ENOENT && errno != ESTALE) { + TH_LOG("Unexpected error: %d", errno); + } + + /* Cleanup */ + umount(tmpfile); + unlink(tmpfile); +} + +/* + * Test multi-level hierarchy (3+ levels deep). + * Grandparent → Parent → Child + * When child is active, both parent AND grandparent should be active. + */ +TEST(ns_multilevel_hierarchy) +{ + struct file_handle *gp_handle, *p_handle, *c_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 gp_id, p_id, c_id; + char gp_buf[sizeof(*gp_handle) + MAX_HANDLE_SZ]; + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; + char c_buf[sizeof(*c_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create grandparent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int gp_fd = open("/proc/self/ns/user", O_RDONLY); + if (gp_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(gp_fd, NS_GET_ID, &gp_id) < 0) { + close(gp_fd); + close(pipefd[1]); + exit(1); + } + close(gp_fd); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int c_fd = open("/proc/self/ns/user", O_RDONLY); + if (c_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c_fd, NS_GET_ID, &c_id) < 0) { + close(c_fd); + close(pipefd[1]); + exit(1); + } + close(c_fd); + + /* Send all three namespace IDs */ + write(pipefd[1], &gp_id, sizeof(gp_id)); + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &c_id, sizeof(c_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &gp_id, sizeof(gp_id)); + if (ret != sizeof(gp_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read grandparent namespace ID from child"); + } + + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID from child"); + } + + ret = read(pipefd[0], &c_id, sizeof(c_id)); + close(pipefd[0]); + if (ret != sizeof(c_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read child namespace ID from child"); + } + + /* Construct file handles from namespace IDs */ + gp_handle = (struct file_handle *)gp_buf; + gp_handle->handle_bytes = sizeof(struct nsfs_file_handle); + gp_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *gp_fh = (struct nsfs_file_handle *)gp_handle->f_handle; + gp_fh->ns_id = gp_id; + gp_fh->ns_type = 0; + gp_fh->ns_inum = 0; + + p_handle = (struct file_handle *)p_buf; + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); + p_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + c_handle = (struct file_handle *)c_buf; + c_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c_fh = (struct nsfs_file_handle *)c_handle->f_handle; + c_fh->ns_id = c_id; + c_fh->ns_type = 0; + c_fh->ns_inum = 0; + + /* Open child before process exits */ + int c_fd = open_by_handle_at(FD_NSFS_ROOT, c_handle, O_RDONLY); + if (c_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespace"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * With 3-level hierarchy and child active: + * - Child is active (we hold fd) + * - Parent should be active (propagated from child) + * - Grandparent should be active (propagated from parent) + */ + TH_LOG("Testing parent active when child is active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + + TH_LOG("Testing grandparent active when child is active"); + int gp_fd = open_by_handle_at(FD_NSFS_ROOT, gp_handle, O_RDONLY); + ASSERT_GE(gp_fd, 0); + + close(c_fd); + close(p_fd); + close(gp_fd); +} + +/* + * Test multiple children sharing same parent. + * Parent should stay active as long as ANY child is active. + */ +TEST(ns_multiple_children_same_parent) +{ + struct file_handle *p_handle, *c1_handle, *c2_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 p_id, c1_id, c2_id; + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; + char c1_buf[sizeof(*c1_handle) + MAX_HANDLE_SZ]; + char c2_buf[sizeof(*c2_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create first child user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int c1_fd = open("/proc/self/ns/user", O_RDONLY); + if (c1_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c1_fd, NS_GET_ID, &c1_id) < 0) { + close(c1_fd); + close(pipefd[1]); + exit(1); + } + close(c1_fd); + + /* Return to parent user namespace and create second child */ + /* We can't actually do this easily, so let's create a sibling namespace + * by creating a network namespace instead */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int c2_fd = open("/proc/self/ns/net", O_RDONLY); + if (c2_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(c2_fd, NS_GET_ID, &c2_id) < 0) { + close(c2_fd); + close(pipefd[1]); + exit(1); + } + close(c2_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &c1_id, sizeof(c1_id)); + write(pipefd[1], &c2_id, sizeof(c2_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID"); + } + + ret = read(pipefd[0], &c1_id, sizeof(c1_id)); + if (ret != sizeof(c1_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read first child namespace ID"); + } + + ret = read(pipefd[0], &c2_id, sizeof(c2_id)); + close(pipefd[0]); + if (ret != sizeof(c2_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read second child namespace ID"); + } + + /* Construct file handles from namespace IDs */ + p_handle = (struct file_handle *)p_buf; + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); + p_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + c1_handle = (struct file_handle *)c1_buf; + c1_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c1_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c1_fh = (struct nsfs_file_handle *)c1_handle->f_handle; + c1_fh->ns_id = c1_id; + c1_fh->ns_type = 0; + c1_fh->ns_inum = 0; + + c2_handle = (struct file_handle *)c2_buf; + c2_handle->handle_bytes = sizeof(struct nsfs_file_handle); + c2_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *c2_fh = (struct nsfs_file_handle *)c2_handle->f_handle; + c2_fh->ns_id = c2_id; + c2_fh->ns_type = 0; + c2_fh->ns_inum = 0; + + /* Open both children before process exits */ + int c1_fd = open_by_handle_at(FD_NSFS_ROOT, c1_handle, O_RDONLY); + int c2_fd = open_by_handle_at(FD_NSFS_ROOT, c2_handle, O_RDONLY); + + if (c1_fd < 0 || c2_fd < 0) { + if (c1_fd >= 0) close(c1_fd); + if (c2_fd >= 0) close(c2_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open child namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Parent should be active (both children active) */ + TH_LOG("Both children active - parent should be active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close first child - parent should STILL be active */ + TH_LOG("Closing first child - parent should still be active"); + close(c1_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close second child - NOW parent should become inactive */ + TH_LOG("Closing second child - parent should become inactive"); + close(c2_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); + ASSERT_LT(p_fd, 0); +} + +/* + * Test that different namespace types with same owner all contribute + * active references to the owning user namespace. + */ +TEST(ns_different_types_same_owner) +{ + struct file_handle *u_handle, *n_handle, *ut_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 u_id, n_id, ut_id; + char u_buf[sizeof(*u_handle) + MAX_HANDLE_SZ]; + char n_buf[sizeof(*n_handle) + MAX_HANDLE_SZ]; + char ut_buf[sizeof(*ut_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int u_fd = open("/proc/self/ns/user", O_RDONLY); + if (u_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { + close(u_fd); + close(pipefd[1]); + exit(1); + } + close(u_fd); + + /* Create network namespace (owned by user namespace) */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int n_fd = open("/proc/self/ns/net", O_RDONLY); + if (n_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { + close(n_fd); + close(pipefd[1]); + exit(1); + } + close(n_fd); + + /* Create UTS namespace (also owned by user namespace) */ + if (unshare(CLONE_NEWUTS) < 0) { + close(pipefd[1]); + exit(1); + } + + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); + if (ut_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { + close(ut_fd); + close(pipefd[1]); + exit(1); + } + close(ut_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &u_id, sizeof(u_id)); + write(pipefd[1], &n_id, sizeof(n_id)); + write(pipefd[1], &ut_id, sizeof(ut_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &u_id, sizeof(u_id)); + if (ret != sizeof(u_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID"); + } + + ret = read(pipefd[0], &n_id, sizeof(n_id)); + if (ret != sizeof(n_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); + close(pipefd[0]); + if (ret != sizeof(ut_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read UTS namespace ID"); + } + + /* Construct file handles from namespace IDs */ + u_handle = (struct file_handle *)u_buf; + u_handle->handle_bytes = sizeof(struct nsfs_file_handle); + u_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)u_handle->f_handle; + u_fh->ns_id = u_id; + u_fh->ns_type = 0; + u_fh->ns_inum = 0; + + n_handle = (struct file_handle *)n_buf; + n_handle->handle_bytes = sizeof(struct nsfs_file_handle); + n_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)n_handle->f_handle; + n_fh->ns_id = n_id; + n_fh->ns_type = 0; + n_fh->ns_inum = 0; + + ut_handle = (struct file_handle *)ut_buf; + ut_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ut_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)ut_handle->f_handle; + ut_fh->ns_id = ut_id; + ut_fh->ns_type = 0; + ut_fh->ns_inum = 0; + + /* Open both non-user namespaces before process exits */ + int n_fd = open_by_handle_at(FD_NSFS_ROOT, n_handle, O_RDONLY); + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, ut_handle, O_RDONLY); + + if (n_fd < 0 || ut_fd < 0) { + if (n_fd >= 0) close(n_fd); + if (ut_fd >= 0) close(ut_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * Both network and UTS namespaces are active. + * User namespace should be active (gets 2 active refs). + */ + TH_LOG("Both net and uts active - user namespace should be active"); + int u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close network namespace - user namespace should STILL be active */ + TH_LOG("Closing network ns - user ns should still be active (uts still active)"); + close(n_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close UTS namespace - user namespace should become inactive */ + TH_LOG("Closing uts ns - user ns should become inactive"); + close(ut_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); + ASSERT_LT(u_fd, 0); +} + +/* + * Test hierarchical propagation with deep namespace hierarchy. + * Create: init_user_ns -> user_A -> user_B -> net_ns + * When net_ns is active, both user_A and user_B should be active. + * This verifies the conditional recursion in __ns_ref_active_put() works. + */ +TEST(ns_deep_hierarchy_propagation) +{ + struct file_handle *ua_handle, *ub_handle, *net_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 ua_id, ub_id, net_id; + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user_A -> user_B -> net hierarchy */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ua_fd = open("/proc/self/ns/user", O_RDONLY); + if (ua_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { + close(ua_fd); + close(pipefd[1]); + exit(1); + } + close(ua_fd); + + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ub_fd = open("/proc/self/ns/user", O_RDONLY); + if (ub_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { + close(ub_fd); + close(pipefd[1]); + exit(1); + } + close(ub_fd); + + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int net_fd = open("/proc/self/ns/net", O_RDONLY); + if (net_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { + close(net_fd); + close(pipefd[1]); + exit(1); + } + close(net_fd); + + /* Send all three namespace IDs */ + write(pipefd[1], &ua_id, sizeof(ua_id)); + write(pipefd[1], &ub_id, sizeof(ub_id)); + write(pipefd[1], &net_id, sizeof(net_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &ua_id, sizeof(ua_id)); + if (ret != sizeof(ua_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_A namespace ID"); + } + + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); + if (ret != sizeof(ub_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_B namespace ID"); + } + + ret = read(pipefd[0], &net_id, sizeof(net_id)); + close(pipefd[0]); + if (ret != sizeof(net_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + /* Construct file handles from namespace IDs */ + ua_handle = (struct file_handle *)ua_buf; + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ua_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; + ua_fh->ns_id = ua_id; + ua_fh->ns_type = 0; + ua_fh->ns_inum = 0; + + ub_handle = (struct file_handle *)ub_buf; + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ub_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; + ub_fh->ns_id = ub_id; + ub_fh->ns_type = 0; + ub_fh->ns_inum = 0; + + net_handle = (struct file_handle *)net_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; + net_fh->ns_id = net_id; + net_fh->ns_type = 0; + net_fh->ns_inum = 0; + + /* Open net_ns before child exits to keep it active */ + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + if (net_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open network namespace"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* With net_ns active, both user_A and user_B should be active */ + TH_LOG("Testing user_B active (net_ns active causes propagation)"); + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + ASSERT_GE(ub_fd, 0); + + TH_LOG("Testing user_A active (propagated through user_B)"); + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd, 0); + + /* Close net_ns - user_B should stay active (we hold direct ref) */ + TH_LOG("Closing net_ns, user_B should remain active (direct ref held)"); + close(net_fd); + int ub_fd2 = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + ASSERT_GE(ub_fd2, 0); + close(ub_fd2); + + /* Close user_B - user_A should stay active (we hold direct ref) */ + TH_LOG("Closing user_B, user_A should remain active (direct ref held)"); + close(ub_fd); + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd2, 0); + close(ua_fd2); + + /* Close user_A - everything should become inactive */ + TH_LOG("Closing user_A, all should become inactive"); + close(ua_fd); + + /* All should now be inactive */ + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_LT(ua_fd, 0); +} + +/* + * Test that parent stays active as long as ANY child is active. + * Create parent user namespace with two child net namespaces. + * Parent should remain active until BOTH children are inactive. + */ +TEST(ns_parent_multiple_children_refcount) +{ + struct file_handle *parent_handle, *net1_handle, *net2_handle; + int ret, pipefd[2], syncpipe[2]; + pid_t pid; + int status; + __u64 p_id, n1_id, n2_id; + char p_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ]; + char n1_buf[sizeof(*net1_handle) + MAX_HANDLE_SZ]; + char n2_buf[sizeof(*net2_handle) + MAX_HANDLE_SZ]; + char sync_byte; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + close(syncpipe[1]); + + /* Create parent user namespace */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int p_fd = open("/proc/self/ns/user", O_RDONLY); + if (p_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { + close(p_fd); + close(pipefd[1]); + exit(1); + } + close(p_fd); + + /* Create first network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + int n1_fd = open("/proc/self/ns/net", O_RDONLY); + if (n1_fd < 0) { + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + if (ioctl(n1_fd, NS_GET_ID, &n1_id) < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + /* Keep n1_fd open so first namespace stays active */ + + /* Create second network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + + int n2_fd = open("/proc/self/ns/net", O_RDONLY); + if (n2_fd < 0) { + close(n1_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + if (ioctl(n2_fd, NS_GET_ID, &n2_id) < 0) { + close(n1_fd); + close(n2_fd); + close(pipefd[1]); + close(syncpipe[0]); + exit(1); + } + /* Keep both n1_fd and n2_fd open */ + + /* Send all namespace IDs */ + write(pipefd[1], &p_id, sizeof(p_id)); + write(pipefd[1], &n1_id, sizeof(n1_id)); + write(pipefd[1], &n2_id, sizeof(n2_id)); + close(pipefd[1]); + + /* Wait for parent to signal before exiting */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + close(pipefd[1]); + close(syncpipe[0]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &p_id, sizeof(p_id)); + if (ret != sizeof(p_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read parent namespace ID"); + } + + ret = read(pipefd[0], &n1_id, sizeof(n1_id)); + if (ret != sizeof(n1_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read first network namespace ID"); + } + + ret = read(pipefd[0], &n2_id, sizeof(n2_id)); + close(pipefd[0]); + if (ret != sizeof(n2_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read second network namespace ID"); + } + + /* Construct file handles from namespace IDs */ + parent_handle = (struct file_handle *)p_buf; + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); + parent_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)parent_handle->f_handle; + p_fh->ns_id = p_id; + p_fh->ns_type = 0; + p_fh->ns_inum = 0; + + net1_handle = (struct file_handle *)n1_buf; + net1_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net1_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n1_fh = (struct nsfs_file_handle *)net1_handle->f_handle; + n1_fh->ns_id = n1_id; + n1_fh->ns_type = 0; + n1_fh->ns_inum = 0; + + net2_handle = (struct file_handle *)n2_buf; + net2_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net2_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n2_fh = (struct nsfs_file_handle *)net2_handle->f_handle; + n2_fh->ns_id = n2_id; + n2_fh->ns_type = 0; + n2_fh->ns_inum = 0; + + /* Open both net namespaces while child is still alive */ + int n1_fd = open_by_handle_at(FD_NSFS_ROOT, net1_handle, O_RDONLY); + int n2_fd = open_by_handle_at(FD_NSFS_ROOT, net2_handle, O_RDONLY); + if (n1_fd < 0 || n2_fd < 0) { + if (n1_fd >= 0) close(n1_fd); + if (n2_fd >= 0) close(n2_fd); + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open net namespaces"); + } + + /* Signal child that we have opened the namespaces */ + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Parent should be active (has 2 active children) */ + TH_LOG("Both net namespaces active - parent should be active"); + int p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close first net namespace - parent should STILL be active */ + TH_LOG("Closing first net ns - parent should still be active"); + close(n1_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_GE(p_fd, 0); + close(p_fd); + + /* Close second net namespace - parent should become inactive */ + TH_LOG("Closing second net ns - parent should become inactive"); + close(n2_fd); + p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); + ASSERT_LT(p_fd, 0); +} + +/* + * Test that user namespace as a child also propagates correctly. + * Create user_A -> user_B, verify when user_B is active that user_A + * is also active. This is different from non-user namespace children. + */ +TEST(ns_userns_child_propagation) +{ + struct file_handle *ua_handle, *ub_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 ua_id, ub_id; + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + /* Create user_A */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ua_fd = open("/proc/self/ns/user", O_RDONLY); + if (ua_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { + close(ua_fd); + close(pipefd[1]); + exit(1); + } + close(ua_fd); + + /* Create user_B (child of user_A) */ + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int ub_fd = open("/proc/self/ns/user", O_RDONLY); + if (ub_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { + close(ub_fd); + close(pipefd[1]); + exit(1); + } + close(ub_fd); + + /* Send both namespace IDs */ + write(pipefd[1], &ua_id, sizeof(ua_id)); + write(pipefd[1], &ub_id, sizeof(ub_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read both namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &ua_id, sizeof(ua_id)); + if (ret != sizeof(ua_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_A namespace ID"); + } + + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); + close(pipefd[0]); + if (ret != sizeof(ub_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user_B namespace ID"); + } + + /* Construct file handles from namespace IDs */ + ua_handle = (struct file_handle *)ua_buf; + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ua_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; + ua_fh->ns_id = ua_id; + ua_fh->ns_type = 0; + ua_fh->ns_inum = 0; + + ub_handle = (struct file_handle *)ub_buf; + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); + ub_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; + ub_fh->ns_id = ub_id; + ub_fh->ns_type = 0; + ub_fh->ns_inum = 0; + + /* Open user_B before child exits */ + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); + if (ub_fd < 0) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open user_B"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* With user_B active, user_A should also be active */ + TH_LOG("Testing user_A active when child user_B is active"); + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd, 0); + + /* Close user_B */ + TH_LOG("Closing user_B"); + close(ub_fd); + + /* user_A should remain active (we hold direct ref) */ + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_GE(ua_fd2, 0); + close(ua_fd2); + + /* Close user_A - should become inactive */ + TH_LOG("Closing user_A - should become inactive"); + close(ua_fd); + + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); + ASSERT_LT(ua_fd, 0); +} + +/* + * Test different namespace types (net, uts, ipc) all contributing + * active references to the same owning user namespace. + */ +TEST(ns_mixed_types_same_owner) +{ + struct file_handle *user_handle, *net_handle, *uts_handle; + int ret, pipefd[2]; + pid_t pid; + int status; + __u64 u_id, n_id, ut_id; + char u_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; + char n_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + char ut_buf[sizeof(*uts_handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + close(pipefd[0]); + + if (setup_userns() < 0) { + close(pipefd[1]); + exit(1); + } + + int u_fd = open("/proc/self/ns/user", O_RDONLY); + if (u_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { + close(u_fd); + close(pipefd[1]); + exit(1); + } + close(u_fd); + + if (unshare(CLONE_NEWNET) < 0) { + close(pipefd[1]); + exit(1); + } + + int n_fd = open("/proc/self/ns/net", O_RDONLY); + if (n_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { + close(n_fd); + close(pipefd[1]); + exit(1); + } + close(n_fd); + + if (unshare(CLONE_NEWUTS) < 0) { + close(pipefd[1]); + exit(1); + } + + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); + if (ut_fd < 0) { + close(pipefd[1]); + exit(1); + } + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { + close(ut_fd); + close(pipefd[1]); + exit(1); + } + close(ut_fd); + + /* Send all namespace IDs */ + write(pipefd[1], &u_id, sizeof(u_id)); + write(pipefd[1], &n_id, sizeof(n_id)); + write(pipefd[1], &ut_id, sizeof(ut_id)); + close(pipefd[1]); + exit(0); + } + + close(pipefd[1]); + + /* Read all three namespace IDs - fixed size, no parsing needed */ + ret = read(pipefd[0], &u_id, sizeof(u_id)); + if (ret != sizeof(u_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID"); + } + + ret = read(pipefd[0], &n_id, sizeof(n_id)); + if (ret != sizeof(n_id)) { + close(pipefd[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID"); + } + + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); + close(pipefd[0]); + if (ret != sizeof(ut_id)) { + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read UTS namespace ID"); + } + + /* Construct file handles from namespace IDs */ + user_handle = (struct file_handle *)u_buf; + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); + user_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)user_handle->f_handle; + u_fh->ns_id = u_id; + u_fh->ns_type = 0; + u_fh->ns_inum = 0; + + net_handle = (struct file_handle *)n_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)net_handle->f_handle; + n_fh->ns_id = n_id; + n_fh->ns_type = 0; + n_fh->ns_inum = 0; + + uts_handle = (struct file_handle *)ut_buf; + uts_handle->handle_bytes = sizeof(struct nsfs_file_handle); + uts_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)uts_handle->f_handle; + ut_fh->ns_id = ut_id; + ut_fh->ns_type = 0; + ut_fh->ns_inum = 0; + + /* Open both non-user namespaces */ + int n_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, uts_handle, O_RDONLY); + if (n_fd < 0 || ut_fd < 0) { + if (n_fd >= 0) close(n_fd); + if (ut_fd >= 0) close(ut_fd); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to open namespaces"); + } + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* User namespace should be active (2 active children) */ + TH_LOG("Both net and uts active - user ns should be active"); + int u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close net - user ns should STILL be active (uts still active) */ + TH_LOG("Closing net - user ns should still be active"); + close(n_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(u_fd, 0); + close(u_fd); + + /* Close uts - user ns should become inactive */ + TH_LOG("Closing uts - user ns should become inactive"); + close(ut_fd); + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_LT(u_fd, 0); +} + +/* Thread test helpers and structures */ +struct thread_ns_info { + __u64 ns_id; + int pipefd; + int syncfd_read; + int syncfd_write; + int exit_code; +}; + +static void *thread_create_namespace(void *arg) +{ + struct thread_ns_info *info = (struct thread_ns_info *)arg; + int ret; + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + info->exit_code = 1; + return NULL; + } + + /* Get namespace ID */ + int fd = open("/proc/thread-self/ns/net", O_RDONLY); + if (fd < 0) { + info->exit_code = 2; + return NULL; + } + + ret = ioctl(fd, NS_GET_ID, &info->ns_id); + close(fd); + if (ret < 0) { + info->exit_code = 3; + return NULL; + } + + /* Send namespace ID to main thread */ + if (write(info->pipefd, &info->ns_id, sizeof(info->ns_id)) != sizeof(info->ns_id)) { + info->exit_code = 4; + return NULL; + } + + /* Wait for signal to exit */ + char sync_byte; + if (read(info->syncfd_read, &sync_byte, 1) != 1) { + info->exit_code = 5; + return NULL; + } + + info->exit_code = 0; + return NULL; +} + +/* + * Test that namespace becomes inactive after thread exits. + * This verifies active reference counting works with threads, not just processes. + */ +TEST(thread_ns_inactive_after_exit) +{ + pthread_t thread; + struct thread_ns_info info; + struct file_handle *handle; + int pipefd[2]; + int syncpipe[2]; + int ret; + char sync_byte; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + info.pipefd = pipefd[1]; + info.syncfd_read = syncpipe[0]; + info.syncfd_write = -1; + info.exit_code = -1; + + /* Create thread that will create a namespace */ + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); + ASSERT_EQ(ret, 0); + + /* Read namespace ID from thread */ + __u64 ns_id; + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); + if (ret != sizeof(ns_id)) { + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + close(syncpipe[1]); + SKIP(return, "Failed to read namespace ID from thread"); + } + + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); + + /* Construct file handle */ + handle = (struct file_handle *)buf; + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; + fh->ns_id = ns_id; + fh->ns_type = 0; + fh->ns_inum = 0; + + /* Namespace should be active while thread is alive */ + TH_LOG("Attempting to open namespace while thread is alive (should succeed)"); + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd, 0); + close(nsfd); + + /* Signal thread to exit */ + TH_LOG("Signaling thread to exit"); + sync_byte = 'X'; + ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1); + close(syncpipe[1]); + + /* Wait for thread to exit */ + ASSERT_EQ(pthread_join(thread, NULL), 0); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + + if (info.exit_code != 0) + SKIP(return, "Thread failed to create namespace"); + + TH_LOG("Thread exited, namespace should be inactive"); + + /* Namespace should now be inactive */ + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(nsfd, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* + * Test that a namespace remains active while a thread holds an fd to it. + * Even after the thread exits, the namespace should remain active as long as + * another thread holds a file descriptor to it. + */ +TEST(thread_ns_fd_keeps_active) +{ + pthread_t thread; + struct thread_ns_info info; + struct file_handle *handle; + int pipefd[2]; + int syncpipe[2]; + int ret; + char sync_byte; + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(pipe(syncpipe), 0); + + info.pipefd = pipefd[1]; + info.syncfd_read = syncpipe[0]; + info.syncfd_write = -1; + info.exit_code = -1; + + /* Create thread that will create a namespace */ + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); + ASSERT_EQ(ret, 0); + + /* Read namespace ID from thread */ + __u64 ns_id; + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); + if (ret != sizeof(ns_id)) { + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + close(syncpipe[1]); + SKIP(return, "Failed to read namespace ID from thread"); + } + + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); + + /* Construct file handle */ + handle = (struct file_handle *)buf; + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; + fh->ns_id = ns_id; + fh->ns_type = 0; + fh->ns_inum = 0; + + /* Open namespace while thread is alive */ + TH_LOG("Opening namespace while thread is alive"); + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd, 0); + + /* Signal thread to exit */ + TH_LOG("Signaling thread to exit"); + sync_byte = 'X'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + /* Wait for thread to exit */ + pthread_join(thread, NULL); + close(pipefd[0]); + close(pipefd[1]); + close(syncpipe[0]); + + if (info.exit_code != 0) { + close(nsfd); + SKIP(return, "Thread failed to create namespace"); + } + + TH_LOG("Thread exited, but main thread holds fd - namespace should remain active"); + + /* Namespace should still be active because we hold an fd */ + int nsfd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_GE(nsfd2, 0); + + /* Verify it's the same namespace */ + struct stat st1, st2; + ASSERT_EQ(fstat(nsfd, &st1), 0); + ASSERT_EQ(fstat(nsfd2, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + close(nsfd2); + + TH_LOG("Closing fd - namespace should become inactive"); + close(nsfd); + + /* Now namespace should be inactive */ + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(nsfd, 0); + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); +} + +/* Structure for thread data in subprocess */ +struct thread_sleep_data { + int syncfd_read; +}; + +static void *thread_sleep_and_wait(void *arg) +{ + struct thread_sleep_data *data = (struct thread_sleep_data *)arg; + char sync_byte; + + /* Wait for signal to exit - read will unblock when pipe is closed */ + (void)read(data->syncfd_read, &sync_byte, 1); + return NULL; +} + +/* + * Test that namespaces become inactive after subprocess with multiple threads exits. + * Create a subprocess that unshares user and network namespaces, then creates two + * threads that share those namespaces. Verify that after all threads and subprocess + * exit, the namespaces are no longer listed by listns() and cannot be opened by + * open_by_handle_at(). + */ +TEST(thread_subprocess_ns_inactive_after_all_exit) +{ + int pipefd[2]; + int sv[2]; + pid_t pid; + int status; + __u64 user_id, net_id; + struct file_handle *user_handle, *net_handle; + char user_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; + char sync_byte; + int ret; + + ASSERT_EQ(pipe(pipefd), 0); + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + close(sv[0]); + + /* Create user namespace with mappings */ + if (setup_userns() < 0) { + fprintf(stderr, "Child: setup_userns() failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: setup_userns() succeeded\n"); + + /* Get user namespace ID */ + int user_fd = open("/proc/self/ns/user", O_RDONLY); + if (user_fd < 0) { + fprintf(stderr, "Child: open(/proc/self/ns/user) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + + if (ioctl(user_fd, NS_GET_ID, &user_id) < 0) { + fprintf(stderr, "Child: ioctl(NS_GET_ID) for user ns failed: %s\n", strerror(errno)); + close(user_fd); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + close(user_fd); + fprintf(stderr, "Child: user ns ID = %llu\n", (unsigned long long)user_id); + + /* Unshare network namespace */ + if (unshare(CLONE_NEWNET) < 0) { + fprintf(stderr, "Child: unshare(CLONE_NEWNET) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: unshare(CLONE_NEWNET) succeeded\n"); + + /* Get network namespace ID */ + int net_fd = open("/proc/self/ns/net", O_RDONLY); + if (net_fd < 0) { + fprintf(stderr, "Child: open(/proc/self/ns/net) failed: %s\n", strerror(errno)); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { + fprintf(stderr, "Child: ioctl(NS_GET_ID) for net ns failed: %s\n", strerror(errno)); + close(net_fd); + close(pipefd[1]); + close(sv[1]); + exit(1); + } + close(net_fd); + fprintf(stderr, "Child: net ns ID = %llu\n", (unsigned long long)net_id); + + /* Send namespace IDs to parent */ + if (write(pipefd[1], &user_id, sizeof(user_id)) != sizeof(user_id)) { + fprintf(stderr, "Child: write(user_id) failed: %s\n", strerror(errno)); + exit(1); + } + if (write(pipefd[1], &net_id, sizeof(net_id)) != sizeof(net_id)) { + fprintf(stderr, "Child: write(net_id) failed: %s\n", strerror(errno)); + exit(1); + } + close(pipefd[1]); + fprintf(stderr, "Child: sent namespace IDs to parent\n"); + + /* Create two threads that share the namespaces */ + pthread_t thread1, thread2; + struct thread_sleep_data data; + data.syncfd_read = sv[1]; + + int ret_thread = pthread_create(&thread1, NULL, thread_sleep_and_wait, &data); + if (ret_thread != 0) { + fprintf(stderr, "Child: pthread_create(thread1) failed: %s\n", strerror(ret_thread)); + close(sv[1]); + exit(1); + } + fprintf(stderr, "Child: created thread1\n"); + + ret_thread = pthread_create(&thread2, NULL, thread_sleep_and_wait, &data); + if (ret_thread != 0) { + fprintf(stderr, "Child: pthread_create(thread2) failed: %s\n", strerror(ret_thread)); + close(sv[1]); + pthread_cancel(thread1); + exit(1); + } + fprintf(stderr, "Child: created thread2\n"); + + /* Wait for threads to complete - they will unblock when parent writes */ + fprintf(stderr, "Child: waiting for threads to exit\n"); + pthread_join(thread1, NULL); + fprintf(stderr, "Child: thread1 exited\n"); + pthread_join(thread2, NULL); + fprintf(stderr, "Child: thread2 exited\n"); + + close(sv[1]); + + /* Exit - namespaces should become inactive */ + fprintf(stderr, "Child: all threads joined, exiting with success\n"); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + close(sv[1]); + + TH_LOG("Parent: waiting to read namespace IDs from child"); + + /* Read namespace IDs from child */ + ret = read(pipefd[0], &user_id, sizeof(user_id)); + if (ret != sizeof(user_id)) { + TH_LOG("Parent: failed to read user_id, ret=%d, errno=%s", ret, strerror(errno)); + close(pipefd[0]); + sync_byte = 'X'; + (void)write(sv[0], &sync_byte, 1); + close(sv[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read user namespace ID from child"); + } + + ret = read(pipefd[0], &net_id, sizeof(net_id)); + close(pipefd[0]); + if (ret != sizeof(net_id)) { + TH_LOG("Parent: failed to read net_id, ret=%d, errno=%s", ret, strerror(errno)); + sync_byte = 'X'; + (void)write(sv[0], &sync_byte, 1); + close(sv[0]); + waitpid(pid, NULL, 0); + SKIP(return, "Failed to read network namespace ID from child"); + } + + TH_LOG("Child created user ns %llu and net ns %llu with 2 threads", + (unsigned long long)user_id, (unsigned long long)net_id); + + /* Construct file handles */ + user_handle = (struct file_handle *)user_buf; + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); + user_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *user_fh = (struct nsfs_file_handle *)user_handle->f_handle; + user_fh->ns_id = user_id; + user_fh->ns_type = 0; + user_fh->ns_inum = 0; + + net_handle = (struct file_handle *)net_buf; + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); + net_handle->handle_type = FILEID_NSFS; + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; + net_fh->ns_id = net_id; + net_fh->ns_type = 0; + net_fh->ns_inum = 0; + + /* Verify namespaces are active while subprocess and threads are alive */ + TH_LOG("Verifying namespaces are active while subprocess with threads is running"); + int user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_GE(user_fd, 0); + + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + ASSERT_GE(net_fd, 0); + + close(user_fd); + close(net_fd); + + /* Also verify they appear in listns() */ + TH_LOG("Verifying namespaces appear in listns() while active"); + struct ns_id_req req = { + .size = sizeof(struct ns_id_req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids < 0) { + TH_LOG("listns() not available, skipping listns verification"); + } else { + /* Check if user_id is in the list */ + int found_user = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == user_id) { + found_user = 1; + break; + } + } + ASSERT_TRUE(found_user); + TH_LOG("User namespace found in listns() as expected"); + + /* Check network namespace */ + req.ns_type = CLONE_NEWNET; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_net = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == net_id) { + found_net = 1; + break; + } + } + ASSERT_TRUE(found_net); + TH_LOG("Network namespace found in listns() as expected"); + } + } + + /* Signal threads to exit */ + TH_LOG("Signaling threads to exit"); + sync_byte = 'X'; + /* Write two bytes - one for each thread */ + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); + close(sv[0]); + + /* Wait for child process to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + if (WEXITSTATUS(status) != 0) { + TH_LOG("Child process failed with exit code %d", WEXITSTATUS(status)); + SKIP(return, "Child process failed"); + } + + TH_LOG("Subprocess and all threads have exited successfully"); + + /* Verify namespaces are now inactive - open_by_handle_at should fail */ + TH_LOG("Verifying namespaces are inactive after subprocess and threads exit"); + user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); + ASSERT_LT(user_fd, 0); + TH_LOG("User namespace inactive as expected: %s (errno=%d)", + strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); + + net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); + ASSERT_LT(net_fd, 0); + TH_LOG("Network namespace inactive as expected: %s (errno=%d)", + strerror(errno), errno); + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); + + /* Verify namespaces do NOT appear in listns() */ + TH_LOG("Verifying namespaces do NOT appear in listns() when inactive"); + memset(&req, 0, sizeof(req)); + req.size = sizeof(struct ns_id_req); + req.ns_type = CLONE_NEWUSER; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_user = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == user_id) { + found_user = 1; + break; + } + } + ASSERT_FALSE(found_user); + TH_LOG("User namespace correctly not listed in listns()"); + + /* Check network namespace */ + req.ns_type = CLONE_NEWNET; + nr_ids = sys_listns(&req, ns_ids, 256, 0); + if (nr_ids >= 0) { + int found_net = 0; + for (int i = 0; i < nr_ids; i++) { + if (ns_ids[i] == net_id) { + found_net = 1; + break; + } + } + ASSERT_FALSE(found_net); + TH_LOG("Network namespace correctly not listed in listns()"); + } + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c index e28accd74a57..527ade0a8673 100644 --- a/tools/testing/selftests/namespaces/nsid_test.c +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -6,6 +6,7 @@ #include <libgen.h> #include <limits.h> #include <pthread.h> +#include <signal.h> #include <string.h> #include <sys/mount.h> #include <poll.h> @@ -14,12 +15,30 @@ #include <sys/stat.h> #include <sys/socket.h> #include <sys/un.h> +#include <sys/wait.h> #include <unistd.h> #include <linux/fs.h> #include <linux/limits.h> #include <linux/nsfs.h> #include "../kselftest_harness.h" +/* Fixture for tests that create child processes */ +FIXTURE(nsid) { + pid_t child_pid; +}; + +FIXTURE_SETUP(nsid) { + self->child_pid = 0; +} + +FIXTURE_TEARDOWN(nsid) { + /* Clean up any child process that may still be running */ + if (self->child_pid > 0) { + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + } +} + TEST(nsid_mntns_basic) { __u64 mnt_ns_id = 0; @@ -44,7 +63,7 @@ TEST(nsid_mntns_basic) close(fd_mntns); } -TEST(nsid_mntns_separate) +TEST_F(nsid, mntns_separate) { __u64 parent_mnt_ns_id = 0; __u64 child_mnt_ns_id = 0; @@ -90,6 +109,9 @@ TEST(nsid_mntns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -99,8 +121,6 @@ TEST(nsid_mntns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_mntns); SKIP(return, "No permission to create mount namespace"); } @@ -123,10 +143,6 @@ TEST(nsid_mntns_separate) close(fd_parent_mntns); close(fd_child_mntns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_cgroupns_basic) @@ -153,7 +169,7 @@ TEST(nsid_cgroupns_basic) close(fd_cgroupns); } -TEST(nsid_cgroupns_separate) +TEST_F(nsid, cgroupns_separate) { __u64 parent_cgroup_ns_id = 0; __u64 child_cgroup_ns_id = 0; @@ -199,6 +215,9 @@ TEST(nsid_cgroupns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -208,8 +227,6 @@ TEST(nsid_cgroupns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_cgroupns); SKIP(return, "No permission to create cgroup namespace"); } @@ -232,10 +249,6 @@ TEST(nsid_cgroupns_separate) close(fd_parent_cgroupns); close(fd_child_cgroupns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_ipcns_basic) @@ -262,7 +275,7 @@ TEST(nsid_ipcns_basic) close(fd_ipcns); } -TEST(nsid_ipcns_separate) +TEST_F(nsid, ipcns_separate) { __u64 parent_ipc_ns_id = 0; __u64 child_ipc_ns_id = 0; @@ -308,6 +321,9 @@ TEST(nsid_ipcns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -317,8 +333,6 @@ TEST(nsid_ipcns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_ipcns); SKIP(return, "No permission to create IPC namespace"); } @@ -341,10 +355,6 @@ TEST(nsid_ipcns_separate) close(fd_parent_ipcns); close(fd_child_ipcns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_utsns_basic) @@ -371,7 +381,7 @@ TEST(nsid_utsns_basic) close(fd_utsns); } -TEST(nsid_utsns_separate) +TEST_F(nsid, utsns_separate) { __u64 parent_uts_ns_id = 0; __u64 child_uts_ns_id = 0; @@ -417,6 +427,9 @@ TEST(nsid_utsns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -426,8 +439,6 @@ TEST(nsid_utsns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_utsns); SKIP(return, "No permission to create UTS namespace"); } @@ -450,10 +461,6 @@ TEST(nsid_utsns_separate) close(fd_parent_utsns); close(fd_child_utsns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_userns_basic) @@ -480,7 +487,7 @@ TEST(nsid_userns_basic) close(fd_userns); } -TEST(nsid_userns_separate) +TEST_F(nsid, userns_separate) { __u64 parent_user_ns_id = 0; __u64 child_user_ns_id = 0; @@ -526,6 +533,9 @@ TEST(nsid_userns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -535,8 +545,6 @@ TEST(nsid_userns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_userns); SKIP(return, "No permission to create user namespace"); } @@ -559,10 +567,6 @@ TEST(nsid_userns_separate) close(fd_parent_userns); close(fd_child_userns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_timens_basic) @@ -591,7 +595,7 @@ TEST(nsid_timens_basic) close(fd_timens); } -TEST(nsid_timens_separate) +TEST_F(nsid, timens_separate) { __u64 parent_time_ns_id = 0; __u64 child_time_ns_id = 0; @@ -652,6 +656,9 @@ TEST(nsid_timens_separate) } } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -660,8 +667,6 @@ TEST(nsid_timens_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_timens); close(pipefd[0]); SKIP(return, "Cannot create time namespace"); @@ -689,10 +694,6 @@ TEST(nsid_timens_separate) close(fd_parent_timens); close(fd_child_timens); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_pidns_basic) @@ -719,7 +720,7 @@ TEST(nsid_pidns_basic) close(fd_pidns); } -TEST(nsid_pidns_separate) +TEST_F(nsid, pidns_separate) { __u64 parent_pid_ns_id = 0; __u64 child_pid_ns_id = 0; @@ -776,6 +777,9 @@ TEST(nsid_pidns_separate) } } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -784,8 +788,6 @@ TEST(nsid_pidns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_pidns); close(pipefd[0]); SKIP(return, "No permission to create PID namespace"); @@ -813,10 +815,6 @@ TEST(nsid_pidns_separate) close(fd_parent_pidns); close(fd_child_pidns); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST(nsid_netns_basic) @@ -860,7 +858,7 @@ TEST(nsid_netns_basic) close(fd_netns); } -TEST(nsid_netns_separate) +TEST_F(nsid, netns_separate) { __u64 parent_net_ns_id = 0; __u64 parent_netns_cookie = 0; @@ -920,6 +918,9 @@ TEST(nsid_netns_separate) _exit(0); } + /* Track child for cleanup */ + self->child_pid = pid; + /* Parent process */ close(pipefd[1]); @@ -929,8 +930,6 @@ TEST(nsid_netns_separate) if (buf == 'S') { /* Child couldn't create namespace, skip test */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); close(fd_parent_netns); close(parent_sock); SKIP(return, "No permission to create network namespace"); @@ -977,10 +976,6 @@ TEST(nsid_netns_separate) close(fd_parent_netns); close(fd_child_netns); close(parent_sock); - - /* Clean up child process */ - kill(pid, SIGTERM); - waitpid(pid, NULL, 0); } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c new file mode 100644 index 000000000000..753fd29dffd8 --- /dev/null +++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include "../pidfd/pidfd.h" +#include "../kselftest_harness.h" + +/* + * Regression tests for the setns(pidfd) active reference counting bug. + * + * These tests are based on the reproducers that triggered the race condition + * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly"). + * + * The bug: When using setns() with a pidfd, if the target task exits between + * prepare_nsset() and commit_nsset(), the namespaces would become inactive. + * Then ns_ref_active_get() would increment from 0 without properly resurrecting + * the owner chain, causing active reference count underflows. + */ + +/* + * Simple pidfd setns test using create_child()+unshare(). + * + * Without the fix, this would trigger active refcount warnings when the + * parent exits after doing setns(pidfd) on a child that has already exited. + */ +TEST(simple_pidfd_setns) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + int sv[2]; + char c; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create a child process without namespaces initially */ + child_pid = create_child(&pidfd, 0); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + close(sv[0]); + + if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) { + close(sv[1]); + _exit(1); + } + + /* Signal parent that namespaces are ready */ + if (write_nointr(sv[1], "1", 1) < 0) { + close(sv[1]); + _exit(1); + } + + close(sv[1]); + _exit(0); + } + ASSERT_GE(pidfd, 0); + EXPECT_EQ(close(sv[1]), 0); + + ret = read_nointr(sv[0], &c, 1); + ASSERT_EQ(ret, 1); + EXPECT_EQ(close(sv[0]), 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + TH_LOG("setns() returned %d", ret); + close(pidfd); +} + +/* + * Simple pidfd setns test using create_child(). + * + * This variation uses create_child() with namespace flags directly. + * Namespaces are created immediately at clone time. + */ +TEST(simple_pidfd_setns_clone) +{ + pid_t child_pid; + int pidfd = -1; + int ret; + + /* Ignore SIGCHLD for autoreap */ + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); + + /* Create a child process with new namespaces using create_child() */ + child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + /* Child: sleep for a while so parent can setns to us */ + sleep(2); + _exit(0); + } + + /* Parent: pidfd was already created by create_child() */ + ASSERT_GE(pidfd, 0); + + /* Set to child's namespaces via pidfd */ + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); + close(pidfd); + TH_LOG("setns() returned %d", ret); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/siocgskns_test.c b/tools/testing/selftests/namespaces/siocgskns_test.c new file mode 100644 index 000000000000..ba689a22d82f --- /dev/null +++ b/tools/testing/selftests/namespaces/siocgskns_test.c @@ -0,0 +1,1824 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/if.h> +#include <linux/sockios.h> +#include <linux/nsfs.h> +#include <arpa/inet.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +#ifndef SIOCGSKNS +#define SIOCGSKNS 0x894C +#endif + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 +#endif + +#ifndef FILEID_NSFS +#define FILEID_NSFS 0xf1 +#endif + +/* + * Test basic SIOCGSKNS functionality. + * Create a socket and verify SIOCGSKNS returns the correct network namespace. + */ +TEST(siocgskns_basic) +{ + int sock_fd, netns_fd, current_netns_fd; + struct stat st1, st2; + + /* Create a TCP socket */ + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Use SIOCGSKNS to get network namespace */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Get current network namespace */ + current_netns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(current_netns_fd, 0); + + /* Verify they match */ + ASSERT_EQ(fstat(netns_fd, &st1), 0); + ASSERT_EQ(fstat(current_netns_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + close(sock_fd); + close(netns_fd); + close(current_netns_fd); +} + +/* + * Test that socket file descriptors keep network namespaces active. + * Create a network namespace, create a socket in it, then exit the namespace. + * The namespace should remain active while the socket FD is held. + */ +TEST(siocgskns_keeps_netns_active) +{ + int sock_fd, netns_fd, test_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno)); + close(ipc_sockets[1]); + exit(1); + } + + /* Create a socket in the new network namespace */ + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + TH_LOG("socket() failed: %s", strerror(errno)); + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS); + + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st), 0); + + /* + * Namespace should still be active because socket FD keeps it alive. + * Try to access it via /proc/self/fd/<fd>. + */ + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fd); + test_fd = open(path, O_RDONLY); + ASSERT_GE(test_fd, 0); + close(test_fd); + close(netns_fd); + + /* Close socket - namespace should become inactive */ + close(sock_fd); + + /* Try SIOCGSKNS again - should fail since socket is closed */ + ASSERT_LT(ioctl(sock_fd, SIOCGSKNS), 0); +} + +/* + * Test SIOCGSKNS with different socket types (TCP, UDP, RAW). + */ +TEST(siocgskns_socket_types) +{ + int sock_tcp, sock_udp, sock_raw; + int netns_tcp, netns_udp, netns_raw; + struct stat st_tcp, st_udp, st_raw; + + /* TCP socket */ + sock_tcp = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_tcp, 0); + + /* UDP socket */ + sock_udp = socket(AF_INET, SOCK_DGRAM, 0); + ASSERT_GE(sock_udp, 0); + + /* RAW socket (may require privileges) */ + sock_raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP); + if (sock_raw < 0 && (errno == EPERM || errno == EACCES)) { + sock_raw = -1; /* Skip raw socket test */ + } + + /* Test SIOCGSKNS on TCP */ + netns_tcp = ioctl(sock_tcp, SIOCGSKNS); + if (netns_tcp < 0) { + close(sock_tcp); + close(sock_udp); + if (sock_raw >= 0) close(sock_raw); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_tcp, 0); + } + + /* Test SIOCGSKNS on UDP */ + netns_udp = ioctl(sock_udp, SIOCGSKNS); + ASSERT_GE(netns_udp, 0); + + /* Test SIOCGSKNS on RAW (if available) */ + if (sock_raw >= 0) { + netns_raw = ioctl(sock_raw, SIOCGSKNS); + ASSERT_GE(netns_raw, 0); + } + + /* Verify all return the same network namespace */ + ASSERT_EQ(fstat(netns_tcp, &st_tcp), 0); + ASSERT_EQ(fstat(netns_udp, &st_udp), 0); + ASSERT_EQ(st_tcp.st_ino, st_udp.st_ino); + + if (sock_raw >= 0) { + ASSERT_EQ(fstat(netns_raw, &st_raw), 0); + ASSERT_EQ(st_tcp.st_ino, st_raw.st_ino); + close(netns_raw); + close(sock_raw); + } + + close(netns_tcp); + close(netns_udp); + close(sock_tcp); + close(sock_udp); +} + +/* + * Test SIOCGSKNS across setns. + * Create a socket in netns A, switch to netns B, verify SIOCGSKNS still + * returns netns A. + */ +TEST(siocgskns_across_setns) +{ + int sock_fd, netns_a_fd, netns_b_fd, result_fd; + struct stat st_a; + + /* Get current netns (A) */ + netns_a_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(netns_a_fd, 0); + ASSERT_EQ(fstat(netns_a_fd, &st_a), 0); + + /* Create socket in netns A */ + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Create new netns (B) */ + ASSERT_EQ(unshare(CLONE_NEWNET), 0); + + netns_b_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(netns_b_fd, 0); + + /* Get netns from socket created in A */ + result_fd = ioctl(sock_fd, SIOCGSKNS); + if (result_fd < 0) { + close(sock_fd); + setns(netns_a_fd, CLONE_NEWNET); + close(netns_a_fd); + close(netns_b_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(result_fd, 0); + } + + /* Verify it still points to netns A */ + struct stat st_result_stat; + ASSERT_EQ(fstat(result_fd, &st_result_stat), 0); + ASSERT_EQ(st_a.st_ino, st_result_stat.st_ino); + + close(result_fd); + close(sock_fd); + close(netns_b_fd); + + /* Restore original netns */ + ASSERT_EQ(setns(netns_a_fd, CLONE_NEWNET), 0); + close(netns_a_fd); +} + +/* + * Test SIOCGSKNS fails on non-socket file descriptors. + */ +TEST(siocgskns_non_socket) +{ + int fd; + int pipefd[2]; + + /* Test on regular file */ + fd = open("/dev/null", O_RDONLY); + ASSERT_GE(fd, 0); + + ASSERT_LT(ioctl(fd, SIOCGSKNS), 0); + ASSERT_TRUE(errno == ENOTTY || errno == EINVAL); + close(fd); + + /* Test on pipe */ + ASSERT_EQ(pipe(pipefd), 0); + + ASSERT_LT(ioctl(pipefd[0], SIOCGSKNS), 0); + ASSERT_TRUE(errno == ENOTTY || errno == EINVAL); + + close(pipefd[0]); + close(pipefd[1]); +} + +/* + * Test multiple sockets keep the same network namespace active. + * Create multiple sockets, verify closing some doesn't affect others. + */ +TEST(siocgskns_multiple_sockets) +{ + int socks[5]; + int netns_fds[5]; + int i; + struct stat st; + ino_t netns_ino; + + /* Create new network namespace */ + ASSERT_EQ(unshare(CLONE_NEWNET), 0); + + /* Create multiple sockets */ + for (i = 0; i < 5; i++) { + socks[i] = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_GE(socks[i], 0); + } + + /* Get netns from all sockets */ + for (i = 0; i < 5; i++) { + netns_fds[i] = ioctl(socks[i], SIOCGSKNS); + if (netns_fds[i] < 0) { + int j; + for (j = 0; j <= i; j++) { + close(socks[j]); + if (j < i && netns_fds[j] >= 0) + close(netns_fds[j]); + } + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fds[i], 0); + } + } + + /* Verify all point to same netns */ + ASSERT_EQ(fstat(netns_fds[0], &st), 0); + netns_ino = st.st_ino; + + for (i = 1; i < 5; i++) { + ASSERT_EQ(fstat(netns_fds[i], &st), 0); + ASSERT_EQ(st.st_ino, netns_ino); + } + + /* Close some sockets */ + for (i = 0; i < 3; i++) { + close(socks[i]); + } + + /* Remaining netns FDs should still be valid */ + for (i = 3; i < 5; i++) { + char path[64]; + snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fds[i]); + int test_fd = open(path, O_RDONLY); + ASSERT_GE(test_fd, 0); + close(test_fd); + } + + /* Cleanup */ + for (i = 0; i < 5; i++) { + if (i >= 3) + close(socks[i]); + close(netns_fds[i]); + } +} + +/* + * Test socket keeps netns active after creating process exits. + * Verify that as long as the socket FD exists, the namespace remains active. + */ +TEST(siocgskns_netns_lifecycle) +{ + int sock_fd, netns_fd; + int ipc_sockets[2]; + int syncpipe[2]; + pid_t pid; + int status; + char sync_byte; + struct stat st; + ino_t netns_ino; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + ASSERT_EQ(pipe(syncpipe), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child */ + close(ipc_sockets[0]); + close(syncpipe[1]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + /* Send socket to parent */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + close(syncpipe[0]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + + /* Wait for parent signal */ + read(syncpipe[0], &sync_byte, 1); + close(syncpipe[0]); + exit(0); + } + + /* Parent */ + close(ipc_sockets[1]); + close(syncpipe[0]); + + /* Receive socket FD */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Get netns from socket while child is alive */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + close(sock_fd); + waitpid(pid, NULL, 0); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + ASSERT_EQ(fstat(netns_fd, &st), 0); + netns_ino = st.st_ino; + + /* Signal child to exit */ + sync_byte = 'G'; + write(syncpipe[1], &sync_byte, 1); + close(syncpipe[1]); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + + /* + * Socket FD should still keep namespace active even after + * the creating process exited. + */ + int test_fd = ioctl(sock_fd, SIOCGSKNS); + ASSERT_GE(test_fd, 0); + + struct stat st_test; + ASSERT_EQ(fstat(test_fd, &st_test), 0); + ASSERT_EQ(st_test.st_ino, netns_ino); + + close(test_fd); + close(netns_fd); + + /* Close socket - namespace should become inactive */ + close(sock_fd); +} + +/* + * Test IPv6 sockets also work with SIOCGSKNS. + */ +TEST(siocgskns_ipv6) +{ + int sock_fd, netns_fd, current_netns_fd; + struct stat st1, st2; + + /* Create an IPv6 TCP socket */ + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); + ASSERT_GE(sock_fd, 0); + + /* Use SIOCGSKNS */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Verify it matches current namespace */ + current_netns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(current_netns_fd, 0); + + ASSERT_EQ(fstat(netns_fd, &st1), 0); + ASSERT_EQ(fstat(current_netns_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + + close(sock_fd); + close(netns_fd); + close(current_netns_fd); +} + +/* + * Test that socket-kept netns appears in listns() output. + * Verify that a network namespace kept alive by a socket FD appears in + * listns() output even after the creating process exits, and that it + * disappears when the socket is closed. + */ +TEST(siocgskns_listns_visibility) +{ + int sock_fd, netns_fd, owner_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + __u64 netns_id, owner_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + bool found_netns = false; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace */ + owner_fd = ioctl(netns_fd, NS_GET_USERNS); + if (owner_fd < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(owner_fd, 0); + } + + /* Get owner namespace ID */ + ret = ioctl(owner_fd, NS_GET_ID, &owner_id); + if (ret < 0) { + close(owner_fd); + close(sock_fd); + close(netns_fd); + ASSERT_EQ(ret, 0); + } + close(owner_fd); + + /* Namespace should appear in listns() output */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + /* Search for our network namespace in the list */ + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_TRUE(found_netns); + TH_LOG("Found netns %llu in listns() output (kept alive by socket)", netns_id); + + /* Now verify with owner filtering */ + req.user_ns_id = owner_id; + found_netns = false; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_TRUE(found_netns); + TH_LOG("Found netns %llu owned by userns %llu", netns_id, owner_id); + + /* Close socket - namespace should become inactive and disappear from listns() */ + close(sock_fd); + close(netns_fd); + + /* Verify it's no longer in listns() output */ + req.user_ns_id = 0; + found_netns = false; + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) { + found_netns = true; + break; + } + } + + ASSERT_FALSE(found_netns); + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); +} + +/* + * Test that socket-kept netns can be reopened via file handle. + * Verify that a network namespace kept alive by a socket FD can be + * reopened using file handles even after the creating process exits. + */ +TEST(siocgskns_file_handle) +{ + int sock_fd, netns_fd, reopened_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st1, st2; + ino_t netns_ino; + __u64 netns_id; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + int ret; + + /* Allocate file_handle structure for nsfs */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new netns and socket */ + close(ipc_sockets[0]); + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st1), 0); + netns_ino = st1.st_ino; + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Construct file handle from namespace ID */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_id; + nsfs_fh->ns_type = 0; /* Type field not needed for reopening */ + nsfs_fh->ns_inum = 0; /* Inum field not needed for reopening */ + + TH_LOG("Constructed file handle for netns %lu (id=%llu)", netns_ino, netns_id); + + /* Reopen namespace using file handle (while socket still keeps it alive) */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + /* Verify it's the same namespace */ + ASSERT_EQ(fstat(reopened_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + TH_LOG("Successfully reopened netns %lu via file handle", netns_ino); + + close(reopened_fd); + + /* Close the netns FD */ + close(netns_fd); + + /* Try to reopen via file handle - should fail since namespace is now inactive */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(reopened_fd, 0); + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Reopen namespace using file handle (while socket still keeps it alive) */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + /* Verify it's the same namespace */ + ASSERT_EQ(fstat(reopened_fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + TH_LOG("Successfully reopened netns %lu via file handle", netns_ino); + + /* Close socket - namespace should become inactive */ + close(sock_fd); + free(handle); +} + +/* + * Test combined listns() and file handle operations with socket-kept netns. + * Create a netns, keep it alive with a socket, verify it appears in listns(), + * then reopen it via file handle obtained from listns() entry. + */ +TEST(siocgskns_listns_and_file_handle) +{ + int sock_fd, netns_fd, userns_fd, reopened_fd; + int ipc_sockets[2]; + pid_t pid; + int status; + struct stat st; + ino_t netns_ino; + __u64 netns_id, userns_id; + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + bool found_netns = false, found_userns = false; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + + /* Allocate file_handle structure for nsfs */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create new userns and netns with socket */ + close(ipc_sockets[0]); + + if (setup_userns() < 0) { + close(ipc_sockets[1]); + exit(1); + } + + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Send socket FD to parent via SCM_RIGHTS */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(sock_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent: receive socket FD */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + ASSERT_EQ(n, 1); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + ASSERT_NE(cmsg, NULL); + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for child to exit */ + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + ASSERT_EQ(fstat(netns_fd, &st), 0); + netns_ino = st.st_ino; + + /* Get namespace ID */ + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace */ + userns_fd = ioctl(netns_fd, NS_GET_USERNS); + if (userns_fd < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(userns_fd, 0); + } + + /* Get owner namespace ID */ + ret = ioctl(userns_fd, NS_GET_ID, &userns_id); + if (ret < 0) { + close(userns_fd); + free(handle); + close(sock_fd); + close(netns_fd); + ASSERT_EQ(ret, 0); + } + close(userns_fd); + + TH_LOG("Testing netns %lu (id=%llu) owned by userns id=%llu", netns_ino, netns_id, userns_id); + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_TRUE(found_netns); + ASSERT_TRUE(found_userns); + TH_LOG("Found netns %llu in listns() output", netns_id); + + /* Construct file handle from namespace ID */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_id; + nsfs_fh->ns_type = 0; + nsfs_fh->ns_inum = 0; + + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + struct stat reopened_st; + ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0); + ASSERT_EQ(reopened_st.st_ino, netns_ino); + + TH_LOG("Successfully reopened netns %lu via file handle (socket-kept)", netns_ino); + + close(reopened_fd); + close(netns_fd); + + /* Try to reopen via file handle - should fail since namespace is now inactive */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + ASSERT_LT(reopened_fd, 0); + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); + + /* Get network namespace from socket */ + netns_fd = ioctl(sock_fd, SIOCGSKNS); + if (netns_fd < 0) { + free(handle); + close(sock_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_fd, 0); + } + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_TRUE(found_netns); + ASSERT_TRUE(found_userns); + TH_LOG("Found netns %llu in listns() output", netns_id); + + close(netns_fd); + + /* Verify namespace appears in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_fd); + close(netns_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + TH_LOG("listns failed: %s", strerror(errno)); + ASSERT_GE(ret, 0); + } + + found_netns = false; + found_userns = false; + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_id) + found_netns = true; + if (ns_ids[i] == userns_id) + found_userns = true; + } + ASSERT_FALSE(found_netns); + ASSERT_FALSE(found_userns); + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); + + close(sock_fd); + free(handle); +} + +/* + * Test multi-level namespace resurrection across three user namespace levels. + * + * This test creates a complex namespace hierarchy with three levels of user + * namespaces and a network namespace at the deepest level. It verifies that + * the resurrection semantics work correctly when SIOCGSKNS is called on a + * socket from an inactive namespace tree, and that listns() and + * open_by_handle_at() correctly respect visibility rules. + * + * Hierarchy after child processes exit (all with 0 active refcount): + * + * net_L3A (0) <- Level 3 network namespace + * | + * + + * userns_L3 (0) <- Level 3 user namespace + * | + * + + * userns_L2 (0) <- Level 2 user namespace + * | + * + + * userns_L1 (0) <- Level 1 user namespace + * | + * x + * init_user_ns + * + * The test verifies: + * 1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain + * 2. After resurrection, all namespaces are visible in listns() + * 3. Resurrected namespaces can be reopened via file handles + * 4. Closing the netns FD cascades down: the entire ownership chain + * (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again + * 5. Inactive namespaces disappear from listns() and cannot be reopened + * 6. Calling SIOCGSKNS again on the same socket resurrects the tree again + * 7. After second resurrection, namespaces are visible and can be reopened + */ +TEST(siocgskns_multilevel_resurrection) +{ + int ipc_sockets[2]; + pid_t pid_l1, pid_l2, pid_l3; + int status; + + /* Namespace file descriptors to be received from child */ + int sock_L3A_fd = -1; + int netns_L3A_fd = -1; + __u64 netns_L3A_id; + __u64 userns_L1_id, userns_L2_id, userns_L3_id; + + /* For listns() and file handle testing */ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids[256]; + int ret, i; + struct file_handle *handle; + struct nsfs_file_handle *nsfs_fh; + int reopened_fd; + + /* Allocate file handle for testing */ + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); + ASSERT_NE(handle, NULL); + handle->handle_bytes = sizeof(struct nsfs_file_handle); + handle->handle_type = FILEID_NSFS; + + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); + + /* + * Fork level 1 child that creates userns_L1 + */ + pid_l1 = fork(); + ASSERT_GE(pid_l1, 0); + + if (pid_l1 == 0) { + /* Level 1 child */ + int ipc_L2[2]; + close(ipc_sockets[0]); + + /* Create userns_L1 */ + if (setup_userns() < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* Create socketpair for communicating with L2 child */ + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L2) < 0) { + close(ipc_sockets[1]); + exit(1); + } + + /* + * Fork level 2 child that creates userns_L2 + */ + pid_l2 = fork(); + if (pid_l2 < 0) { + close(ipc_sockets[1]); + close(ipc_L2[0]); + close(ipc_L2[1]); + exit(1); + } + + if (pid_l2 == 0) { + /* Level 2 child */ + int ipc_L3[2]; + close(ipc_L2[0]); + + /* Create userns_L2 (nested inside userns_L1) */ + if (setup_userns() < 0) { + close(ipc_L2[1]); + exit(1); + } + + /* Create socketpair for communicating with L3 child */ + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L3) < 0) { + close(ipc_L2[1]); + exit(1); + } + + /* + * Fork level 3 child that creates userns_L3 and network namespaces + */ + pid_l3 = fork(); + if (pid_l3 < 0) { + close(ipc_L2[1]); + close(ipc_L3[0]); + close(ipc_L3[1]); + exit(1); + } + + if (pid_l3 == 0) { + /* Level 3 child - the deepest level */ + int sock_fd; + close(ipc_L3[0]); + + /* Create userns_L3 (nested inside userns_L2) */ + if (setup_userns() < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Create network namespace at level 3 */ + if (unshare(CLONE_NEWNET) < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Create socket in net_L3A */ + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if (sock_fd < 0) { + close(ipc_L3[1]); + exit(1); + } + + /* Send socket FD to L2 parent */ + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1] = {'X'}; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); + + if (sendmsg(ipc_L3[1], &msg, 0) < 0) { + close(sock_fd); + close(ipc_L3[1]); + exit(1); + } + + close(sock_fd); + close(ipc_L3[1]); + exit(0); + } + + /* Level 2 child - receive from L3 and forward to L1 */ + close(ipc_L3[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + int received_fd; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_L3[0], &msg, 0); + close(ipc_L3[0]); + + if (n != 1) { + close(ipc_L2[1]); + waitpid(pid_l3, NULL, 0); + exit(1); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + close(ipc_L2[1]); + waitpid(pid_l3, NULL, 0); + exit(1); + } + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L3 child */ + waitpid(pid_l3, NULL, 0); + + /* Forward the socket FD to L1 parent */ + memset(&msg, 0, sizeof(msg)); + buf[0] = 'Y'; + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); + + if (sendmsg(ipc_L2[1], &msg, 0) < 0) { + close(received_fd); + close(ipc_L2[1]); + exit(1); + } + + close(received_fd); + close(ipc_L2[1]); + exit(0); + } + + /* Level 1 child - receive from L2 and forward to parent */ + close(ipc_L2[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + int received_fd; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_L2[0], &msg, 0); + close(ipc_L2[0]); + + if (n != 1) { + close(ipc_sockets[1]); + waitpid(pid_l2, NULL, 0); + exit(1); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + close(ipc_sockets[1]); + waitpid(pid_l2, NULL, 0); + exit(1); + } + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L2 child */ + waitpid(pid_l2, NULL, 0); + + /* Forward the socket FD to parent */ + memset(&msg, 0, sizeof(msg)); + buf[0] = 'Z'; + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); + + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { + close(received_fd); + close(ipc_sockets[1]); + exit(1); + } + + close(received_fd); + close(ipc_sockets[1]); + exit(0); + } + + /* Parent - receive the socket from the deepest level */ + close(ipc_sockets[1]); + + struct msghdr msg = {0}; + struct iovec iov = {0}; + char buf[1]; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + + iov.iov_base = buf; + iov.iov_len = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); + close(ipc_sockets[0]); + + if (n != 1) { + free(handle); + waitpid(pid_l1, NULL, 0); + SKIP(return, "Failed to receive socket from child"); + } + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) { + free(handle); + waitpid(pid_l1, NULL, 0); + SKIP(return, "Failed to receive socket from child"); + } + memcpy(&sock_L3A_fd, CMSG_DATA(cmsg), sizeof(int)); + + /* Wait for L1 child */ + waitpid(pid_l1, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + /* + * At this point, all child processes have exited. The socket itself + * doesn't keep the namespace active - we need to call SIOCGSKNS which + * will resurrect the entire namespace tree by taking active references. + */ + + /* Get network namespace from socket - this resurrects the tree */ + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); + if (netns_L3A_fd < 0) { + free(handle); + close(sock_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "SIOCGSKNS not supported"); + ASSERT_GE(netns_L3A_fd, 0); + } + + /* Get namespace ID for net_L3A */ + ret = ioctl(netns_L3A_fd, NS_GET_ID, &netns_L3A_id); + if (ret < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_ID not supported"); + ASSERT_EQ(ret, 0); + } + + /* Get owner user namespace chain: userns_L3 -> userns_L2 -> userns_L1 */ + int userns_L3_fd = ioctl(netns_L3A_fd, NS_GET_USERNS); + if (userns_L3_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOTTY || errno == EINVAL) + SKIP(return, "NS_GET_USERNS not supported"); + ASSERT_GE(userns_L3_fd, 0); + } + + ret = ioctl(userns_L3_fd, NS_GET_ID, &userns_L3_id); + ASSERT_EQ(ret, 0); + + int userns_L2_fd = ioctl(userns_L3_fd, NS_GET_USERNS); + ASSERT_GE(userns_L2_fd, 0); + ret = ioctl(userns_L2_fd, NS_GET_ID, &userns_L2_id); + ASSERT_EQ(ret, 0); + + int userns_L1_fd = ioctl(userns_L2_fd, NS_GET_USERNS); + ASSERT_GE(userns_L1_fd, 0); + ret = ioctl(userns_L1_fd, NS_GET_ID, &userns_L1_id); + ASSERT_EQ(ret, 0); + + close(userns_L1_fd); + close(userns_L2_fd); + close(userns_L3_fd); + + TH_LOG("Multi-level hierarchy: net_L3A (id=%llu) -> userns_L3 (id=%llu) -> userns_L2 (id=%llu) -> userns_L1 (id=%llu)", + netns_L3A_id, userns_L3_id, userns_L2_id, userns_L1_id); + + /* + * Test 1: Verify net_L3A is visible in listns() after resurrection. + * The entire ownership chain should be resurrected and visible. + */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + if (ret < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret, 0); + } + + bool found_netns_L3A = false; + bool found_userns_L1 = false; + bool found_userns_L2 = false; + bool found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_TRUE(found_netns_L3A); + ASSERT_TRUE(found_userns_L1); + ASSERT_TRUE(found_userns_L2); + ASSERT_TRUE(found_userns_L3); + TH_LOG("Resurrection verified: all namespaces in hierarchy visible in listns()"); + + /* + * Test 2: Verify net_L3A can be reopened via file handle. + */ + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; + nsfs_fh->ns_id = netns_L3A_id; + nsfs_fh->ns_type = 0; + nsfs_fh->ns_inum = 0; + + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + close(reopened_fd); + TH_LOG("File handle test passed: net_L3A can be reopened"); + + /* + * Test 3: Verify that when we close the netns FD (dropping the last + * active reference), the entire tree becomes inactive and disappears + * from listns(). The cascade goes: net_L3A drops -> userns_L3 drops -> + * userns_L2 drops -> userns_L1 drops. + */ + close(netns_L3A_fd); + + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_netns_L3A = false; + found_userns_L1 = false; + found_userns_L2 = false; + found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_FALSE(found_netns_L3A); + ASSERT_FALSE(found_userns_L1); + ASSERT_FALSE(found_userns_L2); + ASSERT_FALSE(found_userns_L3); + TH_LOG("Cascade test passed: all namespaces disappeared after netns FD closed"); + + /* + * Test 4: Verify file handle no longer works for inactive namespace. + */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd >= 0) { + close(reopened_fd); + free(handle); + ASSERT_TRUE(false); /* Should have failed */ + } + TH_LOG("Inactive namespace correctly cannot be reopened via file handle"); + + /* + * Test 5: Verify that calling SIOCGSKNS again resurrects the tree again. + * The socket is still valid, so we can call SIOCGSKNS on it to resurrect + * the namespace tree once more. + */ + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); + ASSERT_GE(netns_L3A_fd, 0); + + TH_LOG("Called SIOCGSKNS again to resurrect the namespace tree"); + + /* Verify the namespace tree is resurrected and visible in listns() */ + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); + ASSERT_GE(ret, 0); + + found_netns_L3A = false; + found_userns_L1 = false; + found_userns_L2 = false; + found_userns_L3 = false; + + for (i = 0; i < ret; i++) { + if (ns_ids[i] == netns_L3A_id) + found_netns_L3A = true; + if (ns_ids[i] == userns_L1_id) + found_userns_L1 = true; + if (ns_ids[i] == userns_L2_id) + found_userns_L2 = true; + if (ns_ids[i] == userns_L3_id) + found_userns_L3 = true; + } + + ASSERT_TRUE(found_netns_L3A); + ASSERT_TRUE(found_userns_L1); + ASSERT_TRUE(found_userns_L2); + ASSERT_TRUE(found_userns_L3); + TH_LOG("Second resurrection verified: all namespaces in hierarchy visible in listns() again"); + + /* Verify we can reopen via file handle again */ + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (reopened_fd < 0) { + free(handle); + close(sock_L3A_fd); + close(netns_L3A_fd); + TH_LOG("open_by_handle_at failed after second resurrection: %s", strerror(errno)); + ASSERT_GE(reopened_fd, 0); + } + + close(reopened_fd); + TH_LOG("File handle test passed: net_L3A can be reopened after second resurrection"); + + /* Final cleanup */ + close(sock_L3A_fd); + close(netns_L3A_fd); + free(handle); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/stress_test.c b/tools/testing/selftests/namespaces/stress_test.c new file mode 100644 index 000000000000..dd7df7d6cb27 --- /dev/null +++ b/tools/testing/selftests/namespaces/stress_test.c @@ -0,0 +1,626 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <linux/nsfs.h> +#include "../kselftest_harness.h" +#include "../filesystems/utils.h" +#include "wrappers.h" + +/* + * Stress tests for namespace active reference counting. + * + * These tests validate that the active reference counting system can handle + * high load scenarios including rapid namespace creation/destruction, large + * numbers of concurrent namespaces, and various edge cases under stress. + */ + +/* + * Test rapid creation and destruction of user namespaces. + * Create and destroy namespaces in quick succession to stress the + * active reference tracking and ensure no leaks occur. + */ +TEST(rapid_namespace_creation_destruction) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[256], ns_ids_after[256]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline count of active user namespaces */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + /* Rapidly create and destroy 100 user namespaces */ + for (i = 0; i < 100; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create user namespace and immediately exit */ + if (setup_userns() < 0) + exit(1); + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + } + + /* Verify we're back to baseline (no leaked namespaces) */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 100 rapid create/destroy cycles: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test creating many concurrent namespaces. + * Verify that listns() correctly tracks all of them and that they all + * become inactive after processes exit. + */ +TEST(many_concurrent_namespaces) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_during[512], ns_ids_after[512]; + ssize_t ret_before, ret_during, ret_after; + pid_t pids[50]; + int num_children = 50; + int i; + int sv[2]; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create many children, each with their own user namespace */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + /* Child: create user namespace and wait for parent signal */ + char c; + + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + /* If we fail to read, kill all children and exit */ + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* List namespaces while all children are running */ + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); + ASSERT_GE(ret_during, 0); + + TH_LOG("With %d children running: %zd active user namespaces", num_children, ret_during); + + /* Should have at least num_children more namespaces than baseline */ + ASSERT_GE(ret_during, ret_before + num_children); + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + /* If we fail to write, kill remaining children */ + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Wait for all children */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After all children exit: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test rapid namespace creation with different namespace types. + * Create multiple types of namespaces rapidly to stress the tracking system. + */ +TEST(rapid_mixed_namespace_creation) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, /* All types */ + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline count */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces (all types)", ret_before); + + /* Rapidly create and destroy namespaces with multiple types */ + for (i = 0; i < 50; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: create multiple namespace types */ + if (setup_userns() < 0) + exit(1); + + /* Create additional namespace types */ + if (unshare(CLONE_NEWNET) < 0) + exit(1); + if (unshare(CLONE_NEWUTS) < 0) + exit(1); + if (unshare(CLONE_NEWIPC) < 0) + exit(1); + + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 50 rapid mixed namespace cycles: %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test nested namespace creation under stress. + * Create deeply nested namespace hierarchies and verify proper cleanup. + */ +TEST(nested_namespace_stress) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int i; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active user namespaces", ret_before); + + /* Create 20 processes, each with nested user namespaces */ + for (i = 0; i < 20; i++) { + pid_t pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int userns_fd; + uid_t orig_uid = getuid(); + int depth; + + /* Create nested user namespaces (up to 5 levels) */ + for (depth = 0; depth < 5; depth++) { + userns_fd = get_userns_fd(0, (depth == 0) ? orig_uid : 0, 1); + if (userns_fd < 0) + exit(1); + + if (setns(userns_fd, CLONE_NEWUSER) < 0) { + close(userns_fd); + exit(1); + } + close(userns_fd); + } + + exit(0); + } + + /* Parent: wait for child */ + int status; + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 20 nested namespace hierarchies: %zd active user namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test listns() pagination under stress. + * Create many namespaces and verify pagination works correctly. + */ +TEST(listns_pagination_stress) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER, + .spare2 = 0, + .user_ns_id = 0, + }; + pid_t pids[30]; + int num_children = 30; + int i; + int sv[2]; + __u64 all_ns_ids[512]; + int total_found = 0; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); + + /* Create many children with user namespaces */ + for (i = 0; i < num_children; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + char c; + close(sv[0]); + + if (setup_userns() < 0) { + close(sv[1]); + exit(1); + } + + /* Signal parent we're ready */ + if (write(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + /* Wait for parent signal to exit */ + if (read(sv[1], &c, 1) != 1) { + close(sv[1]); + exit(1); + } + + close(sv[1]); + exit(0); + } + } + + close(sv[1]); + + /* Wait for all children to signal ready */ + for (i = 0; i < num_children; i++) { + char c; + if (read(sv[0], &c, 1) != 1) { + /* If we fail to read, kill all children and exit */ + close(sv[0]); + for (int j = 0; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + /* Paginate through all namespaces using small batch sizes */ + req.ns_id = 0; + while (1) { + __u64 batch[5]; /* Small batch size to force pagination */ + ssize_t ret; + + ret = sys_listns(&req, batch, ARRAY_SIZE(batch), 0); + if (ret < 0) { + if (errno == ENOSYS) { + close(sv[0]); + for (i = 0; i < num_children; i++) + kill(pids[i], SIGKILL); + for (i = 0; i < num_children; i++) + waitpid(pids[i], NULL, 0); + SKIP(return, "listns() not supported"); + } + ASSERT_GE(ret, 0); + } + + if (ret == 0) + break; + + /* Store results */ + for (i = 0; i < ret && total_found < 512; i++) { + all_ns_ids[total_found++] = batch[i]; + } + + /* Update cursor for next batch */ + if (ret == ARRAY_SIZE(batch)) + req.ns_id = batch[ret - 1]; + else + break; + } + + TH_LOG("Paginated through %d user namespaces", total_found); + + /* Verify no duplicates in pagination */ + for (i = 0; i < total_found; i++) { + for (int j = i + 1; j < total_found; j++) { + if (all_ns_ids[i] == all_ns_ids[j]) { + TH_LOG("Found duplicate ns_id: %llu at positions %d and %d", + (unsigned long long)all_ns_ids[i], i, j); + ASSERT_TRUE(false); + } + } + } + + /* Signal all children to exit */ + for (i = 0; i < num_children; i++) { + char c = 'X'; + if (write(sv[0], &c, 1) != 1) { + close(sv[0]); + for (int j = i; j < num_children; j++) + kill(pids[j], SIGKILL); + for (int j = 0; j < num_children; j++) + waitpid(pids[j], NULL, 0); + ASSERT_TRUE(false); + } + } + + close(sv[0]); + + /* Wait for all children */ + for (i = 0; i < num_children; i++) { + int status; + waitpid(pids[i], &status, 0); + } +} + +/* + * Test concurrent namespace operations. + * Multiple processes creating, querying, and destroying namespaces concurrently. + */ +TEST(concurrent_namespace_operations) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = 0, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + pid_t pids[20]; + int num_workers = 20; + int i; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces", ret_before); + + /* Create worker processes that do concurrent operations */ + for (i = 0; i < num_workers; i++) { + pids[i] = fork(); + ASSERT_GE(pids[i], 0); + + if (pids[i] == 0) { + /* Each worker: create namespaces, list them, repeat */ + int iterations; + + for (iterations = 0; iterations < 10; iterations++) { + int userns_fd; + __u64 temp_ns_ids[100]; + ssize_t ret; + + /* Create a user namespace */ + userns_fd = get_userns_fd(0, getuid(), 1); + if (userns_fd < 0) + continue; + + /* List namespaces */ + ret = sys_listns(&req, temp_ns_ids, ARRAY_SIZE(temp_ns_ids), 0); + (void)ret; + + close(userns_fd); + + /* Small delay */ + usleep(1000); + } + + exit(0); + } + } + + /* Wait for all workers */ + for (i = 0; i < num_workers; i++) { + int status; + waitpid(pids[i], &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After concurrent operations: %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +/* + * Test namespace churn - continuous creation and destruction. + * Simulates high-churn scenarios like container orchestration. + */ +TEST(namespace_churn) +{ + struct ns_id_req req = { + .size = sizeof(req), + .spare = 0, + .ns_id = 0, + .ns_type = CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS, + .spare2 = 0, + .user_ns_id = 0, + }; + __u64 ns_ids_before[512], ns_ids_after[512]; + ssize_t ret_before, ret_after; + int cycle; + + /* Get baseline */ + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); + if (ret_before < 0) { + if (errno == ENOSYS) + SKIP(return, "listns() not supported"); + ASSERT_GE(ret_before, 0); + } + + TH_LOG("Baseline: %zd active namespaces", ret_before); + + /* Simulate churn: batches of namespaces created and destroyed */ + for (cycle = 0; cycle < 10; cycle++) { + pid_t batch_pids[10]; + int i; + + /* Create batch */ + for (i = 0; i < 10; i++) { + batch_pids[i] = fork(); + ASSERT_GE(batch_pids[i], 0); + + if (batch_pids[i] == 0) { + /* Create multiple namespace types */ + if (setup_userns() < 0) + exit(1); + if (unshare(CLONE_NEWNET) < 0) + exit(1); + if (unshare(CLONE_NEWUTS) < 0) + exit(1); + + /* Keep namespaces alive briefly */ + usleep(10000); + exit(0); + } + } + + /* Wait for batch to complete */ + for (i = 0; i < 10; i++) { + int status; + waitpid(batch_pids[i], &status, 0); + } + } + + /* Verify we're back to baseline */ + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); + ASSERT_GE(ret_after, 0); + + TH_LOG("After 10 churn cycles (100 namespace sets): %zd active namespaces", ret_after); + ASSERT_EQ(ret_before, ret_after); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/namespaces/wrappers.h b/tools/testing/selftests/namespaces/wrappers.h new file mode 100644 index 000000000000..9741a64a5b1d --- /dev/null +++ b/tools/testing/selftests/namespaces/wrappers.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/nsfs.h> +#include <linux/types.h> +#include <sys/syscall.h> +#include <unistd.h> + +#ifndef __SELFTESTS_NAMESPACES_WRAPPERS_H__ +#define __SELFTESTS_NAMESPACES_WRAPPERS_H__ + +#ifndef __NR_listns + #if defined __alpha__ + #define __NR_listns 580 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_listns 4470 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_listns 6470 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_listns 5470 + #endif + #else + #define __NR_listns 470 + #endif +#endif + +static inline int sys_listns(const struct ns_id_req *req, __u64 *ns_ids, + size_t nr_ns_ids, unsigned int flags) +{ + return syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags); +} + +#endif /* __SELFTESTS_NAMESPACES_WRAPPERS_H__ */ diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 439101b518ee..8f9850a71f54 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -45,6 +45,7 @@ skf_net_off socket so_incoming_cpu so_netns_cookie +so_peek_off so_txtime so_rcv_listener stress_reuseport_listen diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile index de805cbbdf69..528d14c598bb 100644 --- a/tools/testing/selftests/net/af_unix/Makefile +++ b/tools/testing/selftests/net/af_unix/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := \ scm_inq \ scm_pidfd \ scm_rights \ + so_peek_off \ unix_connect \ # end of TEST_GEN_PROGS diff --git a/tools/testing/selftests/net/af_unix/so_peek_off.c b/tools/testing/selftests/net/af_unix/so_peek_off.c new file mode 100644 index 000000000000..1a77728128e5 --- /dev/null +++ b/tools/testing/selftests/net/af_unix/so_peek_off.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2025 Google LLC */ + +#include <stdlib.h> +#include <unistd.h> + +#include <sys/socket.h> + +#include "../../kselftest_harness.h" + +FIXTURE(so_peek_off) +{ + int fd[2]; /* 0: sender, 1: receiver */ +}; + +FIXTURE_VARIANT(so_peek_off) +{ + int type; +}; + +FIXTURE_VARIANT_ADD(so_peek_off, stream) +{ + .type = SOCK_STREAM, +}; + +FIXTURE_VARIANT_ADD(so_peek_off, dgram) +{ + .type = SOCK_DGRAM, +}; + +FIXTURE_VARIANT_ADD(so_peek_off, seqpacket) +{ + .type = SOCK_SEQPACKET, +}; + +FIXTURE_SETUP(so_peek_off) +{ + struct timeval timeout = { + .tv_sec = 0, + .tv_usec = 3000, + }; + int ret; + + ret = socketpair(AF_UNIX, variant->type, 0, self->fd); + ASSERT_EQ(0, ret); + + ret = setsockopt(self->fd[1], SOL_SOCKET, SO_RCVTIMEO_NEW, + &timeout, sizeof(timeout)); + ASSERT_EQ(0, ret); + + ret = setsockopt(self->fd[1], SOL_SOCKET, SO_PEEK_OFF, + &(int){0}, sizeof(int)); + ASSERT_EQ(0, ret); +} + +FIXTURE_TEARDOWN(so_peek_off) +{ + close_range(self->fd[0], self->fd[1], 0); +} + +#define sendeq(fd, str, flags) \ + do { \ + int bytes, len = strlen(str); \ + \ + bytes = send(fd, str, len, flags); \ + ASSERT_EQ(len, bytes); \ + } while (0) + +#define recveq(fd, str, buflen, flags) \ + do { \ + char buf[(buflen) + 1] = {}; \ + int bytes; \ + \ + bytes = recv(fd, buf, buflen, flags); \ + ASSERT_NE(-1, bytes); \ + ASSERT_STREQ(str, buf); \ + } while (0) + +#define async \ + for (pid_t pid = (pid = fork(), \ + pid < 0 ? \ + __TH_LOG("Failed to start async {}"), \ + _metadata->exit_code = KSFT_FAIL, \ + __bail(1, _metadata), \ + 0xdead : \ + pid); \ + !pid; exit(0)) + +TEST_F(so_peek_off, single_chunk) +{ + sendeq(self->fd[0], "aaaabbbb", 0); + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks) +{ + sendeq(self->fd[0], "aaaa", 0); + sendeq(self->fd[0], "bbbb", 0); + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks_blocking) +{ + async { + usleep(1000); + sendeq(self->fd[0], "aaaa", 0); + } + + recveq(self->fd[1], "aaaa", 4, MSG_PEEK); + + async { + usleep(1000); + sendeq(self->fd[0], "bbbb", 0); + } + + /* goto again; -> goto redo; in unix_stream_read_generic(). */ + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_F(so_peek_off, two_chunks_overlap) +{ + sendeq(self->fd[0], "aaaa", 0); + recveq(self->fd[1], "aa", 2, MSG_PEEK); + + sendeq(self->fd[0], "bbbb", 0); + + if (variant->type == SOCK_STREAM) { + /* SOCK_STREAM tries to fill the buffer. */ + recveq(self->fd[1], "aabb", 4, MSG_PEEK); + recveq(self->fd[1], "bb", 100, MSG_PEEK); + } else { + /* SOCK_DGRAM and SOCK_SEQPACKET returns at the skb boundary. */ + recveq(self->fd[1], "aa", 100, MSG_PEEK); + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); + } +} + +TEST_F(so_peek_off, two_chunks_overlap_blocking) +{ + async { + usleep(1000); + sendeq(self->fd[0], "aaaa", 0); + } + + recveq(self->fd[1], "aa", 2, MSG_PEEK); + + async { + usleep(1000); + sendeq(self->fd[0], "bbbb", 0); + } + + /* Even SOCK_STREAM does not wait if at least one byte is read. */ + recveq(self->fd[1], "aa", 100, MSG_PEEK); + + recveq(self->fd[1], "bbbb", 100, MSG_PEEK); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh index ff2accccaf4d..b4eda6c6199e 100755 --- a/tools/testing/selftests/net/forwarding/lib_sh_test.sh +++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh @@ -30,6 +30,11 @@ tfail() do_test "tfail" false } +tfail2() +{ + do_test "tfail2" false +} + txfail() { FAIL_TO_XFAIL=yes do_test "txfail" false @@ -132,6 +137,8 @@ test_ret() ret_subtest $ksft_fail "tfail" txfail tfail ret_subtest $ksft_xfail "txfail" txfail txfail + + ret_subtest $ksft_fail "tfail2" tfail2 tfail } exit_status_tests_run() diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh index ecd34f364125..892895659c7e 100755 --- a/tools/testing/selftests/net/forwarding/local_termination.sh +++ b/tools/testing/selftests/net/forwarding/local_termination.sh @@ -176,6 +176,8 @@ run_test() local rcv_dmac=$(mac_get $rcv_if_name) local should_receive + setup_wait + tcpdump_start $rcv_if_name mc_route_prepare $send_if_name diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index 2b1d9f2b3e9e..cfc39f70635d 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -754,11 +754,11 @@ static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, static char exthdr_pck[sizeof(buf) + MIN_EXTHDR_SIZE]; create_packet(buf, 0, 0, PAYLOAD_LEN, 0); - add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS, ext_data1); + add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_DSTOPTS, ext_data1); write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr); create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0); - add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS, ext_data2); + add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_DSTOPTS, ext_data2); write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr); } @@ -989,6 +989,7 @@ static void check_recv_pkts(int fd, int *correct_payload, static void gro_sender(void) { + const int fin_delay_us = 100 * 1000; static char fin_pkt[MAX_HDR_LEN]; struct sockaddr_ll daddr = {}; int txfd = -1; @@ -1032,15 +1033,22 @@ static void gro_sender(void) write_packet(txfd, fin_pkt, total_hdr_len, &daddr); } else if (strcmp(testname, "tcp") == 0) { send_changed_checksum(txfd, &daddr); + /* Adding sleep before sending FIN so that it is not + * received prior to other packets. + */ + usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); send_changed_seq(txfd, &daddr); + usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); send_changed_ts(txfd, &daddr); + usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); send_diff_opt(txfd, &daddr); + usleep(fin_delay_us); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); } else if (strcmp(testname, "ip") == 0) { send_changed_ECN(txfd, &daddr); diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index feba4ef69a54..f448bafb3f20 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -43,7 +43,7 @@ __ksft_status_merge() weights[$i]=$((weight++)) done - if [[ ${weights[$a]} > ${weights[$b]} ]]; then + if [[ ${weights[$a]} -ge ${weights[$b]} ]]; then echo "$a" return 0 else diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index b148cadb96d0..fc7e22b503d3 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -710,8 +710,14 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bw = do_rnd_write(peerfd, winfo->buf + winfo->off, winfo->len); if (bw < 0) { - if (cfg_rcv_trunc) - return 0; + /* expected reset, continue to read */ + if (cfg_rcv_trunc && + (errno == ECONNRESET || + errno == EPIPE)) { + fds.events &= ~POLLOUT; + continue; + } + perror("write"); return 111; } @@ -737,8 +743,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, } if (fds.revents & (POLLERR | POLLNVAL)) { - if (cfg_rcv_trunc) - return 0; + if (cfg_rcv_trunc) { + fds.events &= ~(POLLERR | POLLNVAL); + continue; + } fprintf(stderr, "Unexpected revents: " "POLLERR/POLLNVAL(%x)\n", fds.revents); return 5; @@ -1433,7 +1441,7 @@ static void parse_opts(int argc, char **argv) */ if (cfg_truncate < 0) { cfg_rcv_trunc = true; - signal(SIGPIPE, handle_signal); + signal(SIGPIPE, SIG_IGN); } break; case 'j': diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 47ecb5b3836e..9b7b93f8eb0c 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -492,7 +492,7 @@ do_transfer() "than expected (${expect_synrx})" retc=1 fi - if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ] && [ ${stat_ooo_now} -eq 0 ]; then + if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ]; then if [ ${stat_ooo_now} -eq 0 ]; then mptcp_lib_pr_fail "lower MPC ACK rx (${stat_ackrx_now_l})" \ "than expected (${expect_ackrx})" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 78a1aa4ecff2..43f31f8d587f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2532,7 +2532,7 @@ remove_tests() if reset "remove single subflow"; then pm_nl_set_limits $ns1 0 1 pm_nl_set_limits $ns2 0 1 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns2=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 1 1 1 @@ -2545,8 +2545,8 @@ remove_tests() if reset "remove multiple subflows"; then pm_nl_set_limits $ns1 0 2 pm_nl_set_limits $ns2 0 2 - pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns2=-2 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 @@ -2557,7 +2557,7 @@ remove_tests() # single address, remove if reset "remove single address"; then pm_nl_set_limits $ns1 0 1 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 1 addr_nr_ns1=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -2570,9 +2570,9 @@ remove_tests() # subflow and signal, remove if reset "remove subflow and signal"; then pm_nl_set_limits $ns1 0 2 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 2 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup addr_nr_ns1=-1 addr_nr_ns2=-1 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 2 2 2 @@ -2584,10 +2584,10 @@ remove_tests() # subflows and signal, remove if reset "remove subflows and signal"; then pm_nl_set_limits $ns1 0 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 3 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-1 addr_nr_ns2=-2 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2599,9 +2599,9 @@ remove_tests() # addresses remove if reset "remove addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250 - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.4.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250 + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 @@ -2614,10 +2614,10 @@ remove_tests() # invalid addresses remove if reset "remove invalid addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.12.1 flags signal + pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup # broadcast IP: no packet for this address will be received on ns1 - pm_nl_add_endpoint $ns1 224.0.0.1 flags signal - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal + pm_nl_add_endpoint $ns1 224.0.0.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup pm_nl_set_limits $ns2 2 2 addr_nr_ns1=-3 speed=10 \ run_tests $ns1 $ns2 10.0.1.1 @@ -2631,10 +2631,10 @@ remove_tests() # subflows and signal, flush if reset "flush subflows and signal"; then pm_nl_set_limits $ns1 0 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup pm_nl_set_limits $ns2 1 3 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2647,9 +2647,9 @@ remove_tests() if reset "flush subflows"; then pm_nl_set_limits $ns1 3 3 pm_nl_set_limits $ns2 3 3 - pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow id 150 - pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow - pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow + pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup id 150 + pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup + pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 3 3 3 @@ -2666,9 +2666,9 @@ remove_tests() # addresses flush if reset "flush addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250 - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.4.1 flags signal + pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250 + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -2681,9 +2681,9 @@ remove_tests() # invalid addresses flush if reset "flush invalid addresses"; then pm_nl_set_limits $ns1 3 3 - pm_nl_add_endpoint $ns1 10.0.12.1 flags signal - pm_nl_add_endpoint $ns1 10.0.3.1 flags signal - pm_nl_add_endpoint $ns1 10.0.14.1 flags signal + pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup + pm_nl_add_endpoint $ns1 10.0.14.1 flags signal,backup pm_nl_set_limits $ns2 3 3 addr_nr_ns1=-8 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -3500,7 +3500,6 @@ fullmesh_tests() fastclose_tests() { if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then - MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=client \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 @@ -3509,7 +3508,6 @@ fastclose_tests() fi if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then - MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 join_rst_nr=1 \ @@ -3806,7 +3804,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 2 2 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 @@ -3831,7 +3829,7 @@ userspace_tests() chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm create destroy subflow @@ -3839,7 +3837,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3859,7 +3857,7 @@ userspace_tests() chk_mptcp_info subflows 0 subflows 0 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm create id 0 subflow @@ -3867,7 +3865,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3880,7 +3878,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 2 2 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm remove initial subflow @@ -3888,7 +3886,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns2 pm_nl_set_limits $ns1 0 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns2 @@ -3904,7 +3902,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi # userspace pm send RM_ADDR for ID 0 @@ -3912,7 +3910,7 @@ userspace_tests() continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then set_userspace_pm $ns1 pm_nl_set_limits $ns2 1 1 - { speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! wait_mpj $ns1 @@ -3930,7 +3928,7 @@ userspace_tests() chk_mptcp_info subflows 1 subflows 1 chk_subflows_total 1 1 kill_events_pids - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi } @@ -3943,7 +3941,7 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - { speed=slow \ + { timeout_test=120 test_linkfail=128 speed=slow \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -3960,7 +3958,7 @@ endpoint_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags signal pm_nl_check_endpoint "modif is allowed" \ $ns2 10.0.2.2 id 1 flags signal - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid fi if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT && @@ -3970,7 +3968,7 @@ endpoint_tests() pm_nl_set_limits $ns2 0 3 pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - { test_linkfail=4 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4015,7 +4013,7 @@ endpoint_tests() chk_mptcp_info subflows 3 subflows 3 done - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid kill_events_pids chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 @@ -4048,7 +4046,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal - { test_linkfail=4 speed=5 \ + { timeout_test=120 test_linkfail=128 speed=5 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4057,39 +4055,46 @@ endpoint_tests() $ns1 10.0.2.1 id 1 flags signal chk_subflow_nr "before delete" 2 chk_mptcp_info subflows 1 subflows 1 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 1 pm_nl_del_endpoint $ns1 1 10.0.2.1 pm_nl_del_endpoint $ns1 2 224.0.0.1 sleep 0.5 chk_subflow_nr "after delete" 1 chk_mptcp_info subflows 0 subflows 0 + chk_mptcp_info add_addr_signal 0 add_addr_accepted 0 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal wait_mpj $ns2 chk_subflow_nr "after re-add" 3 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_del_endpoint $ns1 42 10.0.1.1 sleep 0.5 chk_subflow_nr "after delete ID 0" 2 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal wait_mpj $ns2 chk_subflow_nr "after re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 + chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 pm_nl_del_endpoint $ns1 99 10.0.1.1 sleep 0.5 chk_subflow_nr "after re-delete ID 0" 2 chk_mptcp_info subflows 2 subflows 2 + chk_mptcp_info add_addr_signal 2 add_addr_accepted 2 pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal wait_mpj $ns2 chk_subflow_nr "after re-re-add ID 0" 3 chk_mptcp_info subflows 3 subflows 3 - mptcp_lib_kill_wait $tests_pid + chk_mptcp_info add_addr_signal 3 add_addr_accepted 2 + mptcp_lib_kill_group_wait $tests_pid kill_events_pids chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1 @@ -4121,7 +4126,7 @@ endpoint_tests() # broadcast IP: no packet for this address will be received on ns1 pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow - { test_linkfail=4 speed=20 \ + { timeout_test=120 test_linkfail=128 speed=20 \ run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null local tests_pid=$! @@ -4137,7 +4142,7 @@ endpoint_tests() wait_mpj $ns2 pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal wait_mpj $ns2 - mptcp_lib_kill_wait $tests_pid + mptcp_lib_kill_group_wait $tests_pid join_syn_tx=3 join_connect_err=1 \ chk_join_nr 2 2 2 diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index d62e653d48b0..f4388900016a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -350,6 +350,27 @@ mptcp_lib_kill_wait() { wait "${1}" 2>/dev/null } +# $1: PID +mptcp_lib_pid_list_children() { + local curr="${1}" + # evoke 'ps' only once + local pids="${2:-"$(ps o pid,ppid)"}" + + echo "${curr}" + + local pid + for pid in $(echo "${pids}" | awk "\$2 == ${curr} { print \$1 }"); do + mptcp_lib_pid_list_children "${pid}" "${pids}" + done +} + +# $1: PID +mptcp_lib_kill_group_wait() { + # Some users might not have procps-ng: cannot use "kill -- -PID" + mptcp_lib_pid_list_children "${1}" | xargs -r kill &>/dev/null + wait "${1}" 2>/dev/null +} + # $1: IP address mptcp_lib_is_v6() { [ -z "${1##*:*}" ] diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index f87993def738..d60f10a873bb 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -148,6 +148,14 @@ #define PIDFD_INFO_COREDUMP (1UL << 4) #endif +#ifndef PIDFD_INFO_SUPPORTED_MASK +#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) +#endif + +#ifndef PIDFD_INFO_COREDUMP_SIGNAL +#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) +#endif + #ifndef PIDFD_COREDUMPED #define PIDFD_COREDUMPED (1U << 0) /* Did crash and... */ #endif @@ -183,8 +191,11 @@ struct pidfd_info { __u32 fsuid; __u32 fsgid; __s32 exit_code; - __u32 coredump_mask; - __u32 __spare1; + struct { + __u32 coredump_mask; + __u32 coredump_signal; + }; + __u64 supported_mask; }; /* diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c index a0eb6e81eaa2..cb5430a2fd75 100644 --- a/tools/testing/selftests/pidfd/pidfd_info_test.c +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -690,4 +690,77 @@ TEST_F(pidfd_info, thread_group_exec_thread) EXPECT_EQ(close(pidfd_thread), 0); } +/* + * Test: PIDFD_INFO_SUPPORTED_MASK field + * + * Verify that when PIDFD_INFO_SUPPORTED_MASK is requested, the kernel + * returns the supported_mask field indicating which flags the kernel supports. + */ +TEST(supported_mask_field) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_SUPPORTED_MASK, + }; + int pidfd; + pid_t pid; + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + + if (pid == 0) + pause(); + + /* Request supported_mask field */ + ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0); + + /* Verify PIDFD_INFO_SUPPORTED_MASK is set in the reply */ + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK)); + + /* Verify supported_mask contains expected flags */ + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_PID)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CGROUPID)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_EXIT)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_SUPPORTED_MASK)); + ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP_SIGNAL)); + + /* Clean up */ + sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); + sys_waitid(P_PIDFD, pidfd, NULL, WEXITED); + close(pidfd); +} + +/* + * Test: PIDFD_INFO_SUPPORTED_MASK always available + * + * Verify that supported_mask is returned even when other fields are requested. + */ +TEST(supported_mask_with_other_fields) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_SUPPORTED_MASK, + }; + int pidfd; + pid_t pid; + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + + if (pid == 0) + pause(); + + ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0); + + /* Both fields should be present */ + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CGROUPID)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK)); + ASSERT_NE(info.supported_mask, 0); + + /* Clean up */ + sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0); + sys_waitid(P_PIDFD, pidfd, NULL, WEXITED); + close(pidfd); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 998e5a2f4579..0091bcd91c2c 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -961,5 +961,49 @@ "teardown": [ "$TC qdisc del dev $DUMMY root" ] + }, + { + "id": "4989", + "name": "Try to add an fq child to an ingress qdisc", + "category": [ + "qdisc", + "ingress" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DUMMY handle ffff:0 ingress" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq", + "expExitCode": "2", + "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:", + "matchJSON": [], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY ingress" + ] + }, + { + "id": "c2b0", + "name": "Try to add an fq child to a clsact qdisc", + "category": [ + "qdisc", + "ingress" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DUMMY handle ffff:0 clsact" + ], + "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq", + "expExitCode": "2", + "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:", + "matchJSON": [], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY clsact" + ] } ] diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c index 252c6308c569..10badae13ebe 100644 --- a/tools/testing/selftests/timers/nanosleep.c +++ b/tools/testing/selftests/timers/nanosleep.c @@ -116,6 +116,56 @@ int nanosleep_test(int clockid, long long ns) return 0; } +static void dummy_event_handler(int val) +{ + /* No action needed */ +} + +static int nanosleep_test_remaining(int clockid) +{ + struct timespec rqtp = {}, rmtp = {}; + struct itimerspec itimer = {}; + struct sigaction sa = {}; + timer_t timer; + int ret; + + sa.sa_handler = dummy_event_handler; + ret = sigaction(SIGALRM, &sa, NULL); + if (ret) + return -1; + + ret = timer_create(clockid, NULL, &timer); + if (ret) + return -1; + + itimer.it_value.tv_nsec = NSEC_PER_SEC / 4; + ret = timer_settime(timer, 0, &itimer, NULL); + if (ret) + return -1; + + rqtp.tv_nsec = NSEC_PER_SEC / 2; + ret = clock_nanosleep(clockid, 0, &rqtp, &rmtp); + if (ret != EINTR) + return -1; + + ret = timer_delete(timer); + if (ret) + return -1; + + sa.sa_handler = SIG_DFL; + ret = sigaction(SIGALRM, &sa, NULL); + if (ret) + return -1; + + if (!in_order((struct timespec) {}, rmtp)) + return -1; + + if (!in_order(rmtp, rqtp)) + return -1; + + return 0; +} + int main(int argc, char **argv) { long long length; @@ -150,6 +200,11 @@ int main(int argc, char **argv) } length *= 100; } + ret = nanosleep_test_remaining(clockid); + if (ret < 0) { + ksft_test_result_fail("%-31s\n", clockstring(clockid)); + ksft_exit_fail(); + } ksft_test_result_pass("%-31s\n", clockstring(clockid)); next: ret = 0; diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index f0eceb0faf34..a563c438ac79 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -18,6 +18,7 @@ #include <time.h> #include <include/vdso/time64.h> #include <pthread.h> +#include <stdbool.h> #include "../kselftest.h" @@ -670,8 +671,14 @@ static void check_timer_create_exact(void) int main(int argc, char **argv) { + bool run_sig_ign_tests = ksft_min_kernel_version(6, 13); + ksft_print_header(); - ksft_set_plan(19); + if (run_sig_ign_tests) { + ksft_set_plan(19); + } else { + ksft_set_plan(10); + } ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); @@ -695,15 +702,20 @@ int main(int argc, char **argv) check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_timer_distribution(); - check_sig_ign(0); - check_sig_ign(1); - check_rearm(); - check_delete(); - check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); - check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); - check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); - check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); - check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + if (run_sig_ign_tests) { + check_sig_ign(0); + check_sig_ign(1); + check_rearm(); + check_delete(); + check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); + check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); + check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); + } else { + ksft_print_msg("Skipping SIG_IGN tests on kernel < 6.13\n"); + } + check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c index 5288e768b207..68625362add2 100644 --- a/tools/testing/selftests/user_events/perf_test.c +++ b/tools/testing/selftests/user_events/perf_test.c @@ -236,7 +236,7 @@ TEST_F(user, perf_empty_events) { ASSERT_EQ(1 << reg.enable_bit, self->check); /* Ensure write shows up at correct offset */ - ASSERT_NE(-1, write(self->data_fd, ®.write_index, + ASSERT_NE(-1, write(self->data_fd, (void *)®.write_index, sizeof(reg.write_index))); val = (void *)(((char *)perf_page) + perf_page->data_offset); ASSERT_EQ(PERF_RECORD_SAMPLE, *val); diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 240409bf5f8a..69ec0c856481 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -4,9 +4,12 @@ #include <fcntl.h> #include <string.h> -#include <linux/vfio.h> + +#include <uapi/linux/types.h> +#include <linux/iommufd.h> #include <linux/list.h> #include <linux/pci_regs.h> +#include <linux/vfio.h> #include "../../../kselftest.h" @@ -185,6 +188,13 @@ struct vfio_pci_device { struct vfio_pci_driver driver; }; +struct iova_allocator { + struct iommu_iova_range *ranges; + u32 nranges; + u32 range_idx; + u64 range_offset; +}; + /* * Return the BDF string of the device that the test should use. * @@ -206,6 +216,13 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_ void vfio_pci_device_cleanup(struct vfio_pci_device *device); void vfio_pci_device_reset(struct vfio_pci_device *device); +struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, + u32 *nranges); + +struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device); +void iova_allocator_cleanup(struct iova_allocator *allocator); +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); + int __vfio_pci_dma_map(struct vfio_pci_device *device, struct vfio_dma_region *region); int __vfio_pci_dma_unmap(struct vfio_pci_device *device, diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index a381fd253aa7..b479a359da12 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -12,11 +12,12 @@ #include <sys/mman.h> #include <uapi/linux/types.h> +#include <linux/iommufd.h> #include <linux/limits.h> #include <linux/mman.h> +#include <linux/overflow.h> #include <linux/types.h> #include <linux/vfio.h> -#include <linux/iommufd.h> #include "../../../kselftest.h" #include <vfio_util.h> @@ -29,6 +30,249 @@ VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ } while (0) +static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz, + u32 *cap_offset) +{ + struct vfio_info_cap_header *hdr; + + if (!*cap_offset) + return NULL; + + VFIO_ASSERT_LT(*cap_offset, bufsz); + VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr)); + + hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset); + *cap_offset = hdr->next; + + return hdr; +} + +static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info, + u16 cap_id) +{ + struct vfio_info_cap_header *hdr; + u32 cap_offset = info->cap_offset; + u32 max_depth; + u32 depth = 0; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) + return NULL; + + if (cap_offset) + VFIO_ASSERT_GE(cap_offset, sizeof(*info)); + + max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr); + + while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) { + depth++; + VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n"); + + if (hdr->id == cap_id) + return hdr; + } + + return NULL; +} + +/* Return buffer including capability chain, if present. Free with free() */ +static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device *device) +{ + struct vfio_iommu_type1_info *info; + + info = malloc(sizeof(*info)); + VFIO_ASSERT_NOT_NULL(info); + + *info = (struct vfio_iommu_type1_info) { + .argsz = sizeof(*info), + }; + + ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + info = realloc(info, info->argsz); + VFIO_ASSERT_NOT_NULL(info); + + ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + return info; +} + +/* + * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to + * report iommufd's iommu_iova_range. Free with free(). + */ +static struct iommu_iova_range *vfio_iommu_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct vfio_iommu_type1_info_cap_iova_range *cap_range; + struct vfio_iommu_type1_info *info; + struct vfio_info_cap_header *hdr; + struct iommu_iova_range *ranges = NULL; + + info = vfio_iommu_get_info(device); + hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + VFIO_ASSERT_NOT_NULL(hdr); + + cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header); + VFIO_ASSERT_GT(cap_range->nr_iovas, 0); + + ranges = calloc(cap_range->nr_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + for (u32 i = 0; i < cap_range->nr_iovas; i++) { + ranges[i] = (struct iommu_iova_range){ + .start = cap_range->iova_ranges[i].start, + .last = cap_range->iova_ranges[i].end, + }; + } + + *nranges = cap_range->nr_iovas; + + free(info); + return ranges; +} + +/* Return iova ranges of the device's IOAS. Free with free() */ +static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + int ret; + + struct iommu_ioas_iova_ranges query = { + .size = sizeof(query), + .ioas_id = device->ioas_id, + }; + + ret = ioctl(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + VFIO_ASSERT_EQ(ret, -1); + VFIO_ASSERT_EQ(errno, EMSGSIZE); + VFIO_ASSERT_GT(query.num_iovas, 0); + + ranges = calloc(query.num_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + query.allowed_iovas = (uintptr_t)ranges; + + ioctl_assert(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + *nranges = query.num_iovas; + + return ranges; +} + +static int iova_range_comp(const void *a, const void *b) +{ + const struct iommu_iova_range *ra = a, *rb = b; + + if (ra->start < rb->start) + return -1; + + if (ra->start > rb->start) + return 1; + + return 0; +} + +/* Return sorted IOVA ranges of the device. Free with free(). */ +struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + + if (device->iommufd) + ranges = iommufd_iova_ranges(device, nranges); + else + ranges = vfio_iommu_iova_ranges(device, nranges); + + if (!ranges) + return NULL; + + VFIO_ASSERT_GT(*nranges, 0); + + /* Sort and check that ranges are sane and non-overlapping */ + qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp); + VFIO_ASSERT_LT(ranges[0].start, ranges[0].last); + + for (u32 i = 1; i < *nranges; i++) { + VFIO_ASSERT_LT(ranges[i].start, ranges[i].last); + VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start); + } + + return ranges; +} + +struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device) +{ + struct iova_allocator *allocator; + struct iommu_iova_range *ranges; + u32 nranges; + + ranges = vfio_pci_iova_ranges(device, &nranges); + VFIO_ASSERT_NOT_NULL(ranges); + + allocator = malloc(sizeof(*allocator)); + VFIO_ASSERT_NOT_NULL(allocator); + + *allocator = (struct iova_allocator){ + .ranges = ranges, + .nranges = nranges, + .range_idx = 0, + .range_offset = 0, + }; + + return allocator; +} + +void iova_allocator_cleanup(struct iova_allocator *allocator) +{ + free(allocator->ranges); + free(allocator); +} + +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) +{ + VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n"); + VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n"); + + for (;;) { + struct iommu_iova_range *range; + iova_t iova, last; + + VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges, + "IOVA allocator out of space\n"); + + range = &allocator->ranges[allocator->range_idx]; + iova = range->start + allocator->range_offset; + + /* Check for sufficient space at the current offset */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + /* Align iova to size */ + iova = last & ~(size - 1); + + /* Check for sufficient space at the aligned iova */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + if (last == range->last) { + allocator->range_idx++; + allocator->range_offset = 0; + } else { + allocator->range_offset = last - range->start + 1; + } + + return iova; + +next_range: + allocator->range_idx++; + allocator->range_offset = 0; + } +} + iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) { struct vfio_dma_region *region; diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 4f1ea79a200c..102603d4407d 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -3,6 +3,8 @@ #include <sys/mman.h> #include <unistd.h> +#include <uapi/linux/types.h> +#include <linux/iommufd.h> #include <linux/limits.h> #include <linux/mman.h> #include <linux/sizes.h> @@ -93,6 +95,7 @@ static int iommu_mapping_get(const char *bdf, u64 iova, FIXTURE(vfio_dma_mapping_test) { struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; }; FIXTURE_VARIANT(vfio_dma_mapping_test) { @@ -117,10 +120,12 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB | FIXTURE_SETUP(vfio_dma_mapping_test) { self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iova_allocator = iova_allocator_init(self->device); } FIXTURE_TEARDOWN(vfio_dma_mapping_test) { + iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); } @@ -142,7 +147,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) else ASSERT_NE(region.vaddr, MAP_FAILED); - region.iova = (u64)region.vaddr; + region.iova = iova_allocator_alloc(self->iova_allocator, size); region.size = size; vfio_pci_dma_map(self->device, ®ion); @@ -219,7 +224,10 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); FIXTURE_SETUP(vfio_dma_map_limit_test) { struct vfio_dma_region *region = &self->region; + struct iommu_iova_range *ranges; u64 region_size = getpagesize(); + iova_t last_iova; + u32 nranges; /* * Over-allocate mmap by double the size to provide enough backing vaddr @@ -232,8 +240,13 @@ FIXTURE_SETUP(vfio_dma_map_limit_test) MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); ASSERT_NE(region->vaddr, MAP_FAILED); - /* One page prior to the end of address space */ - region->iova = ~(iova_t)0 & ~(region_size - 1); + ranges = vfio_pci_iova_ranges(self->device, &nranges); + VFIO_ASSERT_NOT_NULL(ranges); + last_iova = ranges[nranges - 1].last; + free(ranges); + + /* One page prior to the last iova */ + region->iova = last_iova & ~(region_size - 1); region->size = region_size; } @@ -276,6 +289,7 @@ TEST_F(vfio_dma_map_limit_test, overflow) struct vfio_dma_region *region = &self->region; int rc; + region->iova = ~(iova_t)0 & ~(region->size - 1); region->size = self->mmap_size; rc = __vfio_pci_dma_map(self->device, region); diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index 2dbd70b7db62..f69eec8b928d 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -19,6 +19,7 @@ static const char *device_bdf; } while (0) static void region_setup(struct vfio_pci_device *device, + struct iova_allocator *iova_allocator, struct vfio_dma_region *region, u64 size) { const int flags = MAP_SHARED | MAP_ANONYMOUS; @@ -29,7 +30,7 @@ static void region_setup(struct vfio_pci_device *device, VFIO_ASSERT_NE(vaddr, MAP_FAILED); region->vaddr = vaddr; - region->iova = (u64)vaddr; + region->iova = iova_allocator_alloc(iova_allocator, size); region->size = size; vfio_pci_dma_map(device, region); @@ -44,6 +45,7 @@ static void region_teardown(struct vfio_pci_device *device, FIXTURE(vfio_pci_driver_test) { struct vfio_pci_device *device; + struct iova_allocator *iova_allocator; struct vfio_dma_region memcpy_region; void *vaddr; int msi_fd; @@ -72,14 +74,15 @@ FIXTURE_SETUP(vfio_pci_driver_test) struct vfio_pci_driver *driver; self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iova_allocator = iova_allocator_init(self->device); driver = &self->device->driver; - region_setup(self->device, &self->memcpy_region, SZ_1G); - region_setup(self->device, &driver->region, SZ_2M); + region_setup(self->device, self->iova_allocator, &self->memcpy_region, SZ_1G); + region_setup(self->device, self->iova_allocator, &driver->region, SZ_2M); /* Any IOVA that doesn't overlap memcpy_region and driver->region. */ - self->unmapped_iova = 8UL * SZ_1G; + self->unmapped_iova = iova_allocator_alloc(self->iova_allocator, SZ_1G); vfio_pci_driver_init(self->device); self->msi_fd = self->device->msi_eventfds[driver->msi]; @@ -108,6 +111,7 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test) region_teardown(self->device, &self->memcpy_region); region_teardown(self->device, &driver->region); + iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); } diff --git a/tools/testing/selftests/vsock/vmtest.sh b/tools/testing/selftests/vsock/vmtest.sh index edacebfc1632..8ceeb8a7894f 100755 --- a/tools/testing/selftests/vsock/vmtest.sh +++ b/tools/testing/selftests/vsock/vmtest.sh @@ -389,9 +389,9 @@ run_test() { local rc host_oops_cnt_before=$(dmesg | grep -c -i 'Oops') - host_warn_cnt_before=$(dmesg --level=warn | wc -l) + host_warn_cnt_before=$(dmesg --level=warn | grep -c -i 'vsock') vm_oops_cnt_before=$(vm_ssh -- dmesg | grep -c -i 'Oops') - vm_warn_cnt_before=$(vm_ssh -- dmesg --level=warn | wc -l) + vm_warn_cnt_before=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') name=$(echo "${1}" | awk '{ print $1 }') eval test_"${name}" @@ -403,7 +403,7 @@ run_test() { rc=$KSFT_FAIL fi - host_warn_cnt_after=$(dmesg --level=warn | wc -l) + host_warn_cnt_after=$(dmesg --level=warn | grep -c -i 'vsock') if [[ ${host_warn_cnt_after} -gt ${host_warn_cnt_before} ]]; then echo "FAIL: kernel warning detected on host" | log_host "${name}" rc=$KSFT_FAIL @@ -415,7 +415,7 @@ run_test() { rc=$KSFT_FAIL fi - vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | wc -l) + vm_warn_cnt_after=$(vm_ssh -- dmesg --level=warn | grep -c -i 'vsock') if [[ ${vm_warn_cnt_after} -gt ${vm_warn_cnt_before} ]]; then echo "FAIL: kernel warning detected on vm" | log_host "${name}" rc=$KSFT_FAIL diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c index cf263fe9deaf..ef97916e3873 100644 --- a/tools/tracing/latency/latency-collector.c +++ b/tools/tracing/latency/latency-collector.c @@ -1725,7 +1725,7 @@ static void show_usage(void) "-n, --notrace\t\tIf latency is detected, do not print out the content of\n" "\t\t\tthe trace file to standard output\n\n" -"-t, --threads NRTHR\tRun NRTHR threads for printing. Default is %d.\n\n" +"-e, --threads NRTHR\tRun NRTHR threads for printing. Default is %d.\n\n" "-r, --random\t\tArbitrarily sleep a certain amount of time, default\n" "\t\t\t%ld ms, before reading the trace file. The\n" diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index fbca8c0972da..ffadc5ee8e04 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -623,31 +623,50 @@ err: return r; } -void kvm_gmem_unbind(struct kvm_memory_slot *slot) +static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct kvm_gmem *gmem) { unsigned long start = slot->gmem.pgoff; unsigned long end = start + slot->npages; - struct kvm_gmem *gmem; + + xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); + + /* + * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn() + * cannot see this memslot. + */ + WRITE_ONCE(slot->gmem.file, NULL); +} + +void kvm_gmem_unbind(struct kvm_memory_slot *slot) +{ struct file *file; /* - * Nothing to do if the underlying file was already closed (or is being - * closed right now), kvm_gmem_release() invalidates all bindings. + * Nothing to do if the underlying file was _already_ closed, as + * kvm_gmem_release() invalidates and nullifies all bindings. */ - file = kvm_gmem_get_file(slot); - if (!file) + if (!slot->gmem.file) return; - gmem = file->private_data; - - filemap_invalidate_lock(file->f_mapping); - xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); + file = kvm_gmem_get_file(slot); /* - * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn() - * cannot see this memslot. + * However, if the file is _being_ closed, then the bindings need to be + * removed as kvm_gmem_release() might not run until after the memslot + * is freed. Note, modifying the bindings is safe even though the file + * is dying as kvm_gmem_release() nullifies slot->gmem.file under + * slots_lock, and only puts its reference to KVM after destroying all + * bindings. I.e. reaching this point means kvm_gmem_release() hasn't + * yet destroyed the bindings or freed the gmem_file, and can't do so + * until the caller drops slots_lock. */ - WRITE_ONCE(slot->gmem.file, NULL); + if (!file) { + __kvm_gmem_unbind(slot, slot->gmem.file->private_data); + return; + } + + filemap_invalidate_lock(file->f_mapping); + __kvm_gmem_unbind(slot, file->private_data); filemap_invalidate_unlock(file->f_mapping); fput(file); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..4255fcf9c6e5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -49,6 +49,7 @@ #include <linux/lockdep.h> #include <linux/kthread.h> #include <linux/suspend.h> +#include <linux/rseq.h> #include <asm/processor.h> #include <asm/ioctl.h> @@ -4476,6 +4477,12 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_run(vcpu); vcpu->wants_to_run = false; + /* + * FIXME: Remove this hack once all KVM architectures + * support the generic TIF bits, i.e. a dedicated TIF_RSEQ. + */ + rseq_virt_userspace_exit(); + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } |
