From 22571172257a55c443f1a9306e963da4c6187e83 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Wed, 12 Feb 2025 18:03:53 +0100 Subject: [PATCH 001/450] dt-bindings: dma: qcom: bam-dma: Add missing required properties num-channels and qcom,num-ees are required when there are no clocks specified in the device tree, because we have no reliable way to read them from the hardware registers if we cannot ensure the BAM hardware is up when the device is being probed. This has often been forgotten when adding new SoC device trees, so make this clear by describing this requirement in the schema. Signed-off-by: Stephan Gerhold Link: https://lore.kernel.org/r/20250212-bam-dma-fixes-v1-7-f560889e65d8@linaro.org Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml b/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml index f2f87f0f545b..6493a6968bb4 100644 --- a/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml +++ b/Documentation/devicetree/bindings/dma/qcom,bam-dma.yaml @@ -92,8 +92,12 @@ required: anyOf: - required: - qcom,powered-remotely + - num-channels + - qcom,num-ees - required: - qcom,controlled-remotely + - num-channels + - qcom,num-ees - required: - clocks - clock-names From 5068b5254812433e841a40886e695633148d362d Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Wed, 12 Feb 2025 18:03:54 +0100 Subject: [PATCH 002/450] dmaengine: qcom: bam_dma: Fix DT error handling for num-channels/ees When we don't have a clock specified in the device tree, we have no way to ensure the BAM is on. This is often the case for remotely-controlled or remotely-powered BAM instances. In this case, we need to read num-channels from the DT to have all the necessary information to complete probing. However, at the moment invalid device trees without clock and without num-channels still continue probing, because the error handling is missing return statements. The driver will then later try to read the number of channels from the registers. This is unsafe, because it relies on boot firmware and lucky timing to succeed. Unfortunately, the lack of proper error handling here has been abused for several Qualcomm SoCs upstream, causing early boot crashes in several situations [1, 2]. Avoid these early crashes by erroring out when any of the required DT properties are missing. Note that this will break some of the existing DTs upstream (mainly BAM instances related to the crypto engine). However, clearly these DTs have never been tested properly, since the error in the kernel log was just ignored. It's safer to disable the crypto engine for these broken DTBs. [1]: https://lore.kernel.org/r/CY01EKQVWE36.B9X5TDXAREPF@fairphone.com/ [2]: https://lore.kernel.org/r/20230626145959.646747-1-krzysztof.kozlowski@linaro.org/ Cc: stable@vger.kernel.org Fixes: 48d163b1aa6e ("dmaengine: qcom: bam_dma: get num-channels and num-ees from dt") Signed-off-by: Stephan Gerhold Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20250212-bam-dma-fixes-v1-8-f560889e65d8@linaro.org Signed-off-by: Vinod Koul --- drivers/dma/qcom/bam_dma.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c index bbc3276992bb..2cf060174795 100644 --- a/drivers/dma/qcom/bam_dma.c +++ b/drivers/dma/qcom/bam_dma.c @@ -1283,13 +1283,17 @@ static int bam_dma_probe(struct platform_device *pdev) if (!bdev->bamclk) { ret = of_property_read_u32(pdev->dev.of_node, "num-channels", &bdev->num_channels); - if (ret) + if (ret) { dev_err(bdev->dev, "num-channels unspecified in dt\n"); + return ret; + } ret = of_property_read_u32(pdev->dev.of_node, "qcom,num-ees", &bdev->num_ees); - if (ret) + if (ret) { dev_err(bdev->dev, "num-ees unspecified in dt\n"); + return ret; + } } ret = clk_prepare_enable(bdev->bamclk); From 1071d560afb4c245c2076494226df47db5a35708 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 11 Aug 2025 13:17:32 +0200 Subject: [PATCH 003/450] dm-stripe: fix a possible integer overflow There's a possible integer overflow in stripe_io_hints if we have too large chunk size. Test if the overflow happened, and if it did, don't set limits->io_min and limits->io_opt; Signed-off-by: Mikulas Patocka Reviewed-by: John Garry Suggested-by: Dongsheng Yang Cc: stable@vger.kernel.org --- drivers/md/dm-stripe.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 58902091bf79..1461dc740dae 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -456,11 +456,15 @@ static void stripe_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct stripe_c *sc = ti->private; - unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT; + unsigned int io_min, io_opt; limits->chunk_sectors = sc->chunk_size; - limits->io_min = chunk_size; - limits->io_opt = chunk_size * sc->stripes; + + if (!check_shl_overflow(sc->chunk_size, SECTOR_SHIFT, &io_min) && + !check_mul_overflow(io_min, sc->stripes, &io_opt)) { + limits->io_min = io_min; + limits->io_opt = io_opt; + } } static struct target_type stripe_target = { From 942e47ab228c7dd27c2ae043c17e7aab2028082c Mon Sep 17 00:00:00 2001 From: Pengyu Luo Date: Tue, 12 Aug 2025 17:39:56 +0800 Subject: [PATCH 004/450] phy: qualcomm: phy-qcom-eusb2-repeater: fix override properties property "qcom,tune-usb2-preem" is for EUSB2_TUNE_USB2_PREEM property "qcom,tune-usb2-amplitude" is for EUSB2_TUNE_IUSB2 The downstream correspondence is as follows: EUSB2_TUNE_USB2_PREEM: Tx pre-emphasis tuning EUSB2_TUNE_IUSB2: HS trasmit amplitude EUSB2_TUNE_SQUELCH_U: Squelch detection threshold EUSB2_TUNE_HSDISC: HS disconnect threshold EUSB2_TUNE_EUSB_SLEW: slew rate Fixes: 31bc94de7602 ("phy: qualcomm: phy-qcom-eusb2-repeater: Don't zero-out registers") Signed-off-by: Pengyu Luo Reviewed-by: Konrad Dybcio Reviewed-by: Luca Weiss Link: https://lore.kernel.org/r/20250812093957.32235-1-mitltlatltl@gmail.com Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c b/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c index e0f2acc8109c..8fcbc312fd61 100644 --- a/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c +++ b/drivers/phy/qualcomm/phy-qcom-eusb2-repeater.c @@ -127,13 +127,13 @@ static int eusb2_repeater_init(struct phy *phy) rptr->cfg->init_tbl[i].value); /* Override registers from devicetree values */ - if (!of_property_read_u8(np, "qcom,tune-usb2-amplitude", &val)) + if (!of_property_read_u8(np, "qcom,tune-usb2-preem", &val)) regmap_write(regmap, base + EUSB2_TUNE_USB2_PREEM, val); if (!of_property_read_u8(np, "qcom,tune-usb2-disc-thres", &val)) regmap_write(regmap, base + EUSB2_TUNE_HSDISC, val); - if (!of_property_read_u8(np, "qcom,tune-usb2-preem", &val)) + if (!of_property_read_u8(np, "qcom,tune-usb2-amplitude", &val)) regmap_write(regmap, base + EUSB2_TUNE_IUSB2, val); /* Wait for status OK */ From 5cfdfc623835d40664f642b75a56d9934f24c5ff Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 6 Aug 2025 15:01:37 -0500 Subject: [PATCH 005/450] dt-bindings: phy: marvell,comphy-cp110: Fix clock and child node constraints In converting marvell,comphy-cp110 to schema, the constraints for clocks on marvell,comphy-a3700 are wrong, the maximum number of child nodes are wrong, and the phy nodes may have a 'connector' child node: phy@18300 (marvell,comphy-a3700): clock-names: False schema does not allow ['xtal'] phy@120000 (marvell,comphy-cp110): 'phy@3', 'phy@4', 'phy@5' do not match any of the regexes: '^phy@[0-2]$', '^pinctrl-[0-9]+$' phy@120000 (marvell,comphy-cp110): phy@2: 'connector' does not match any of the regexes: '^pinctrl-[0-9]+$' Fixes: 50355ac70d4f ("dt-bindings: phy: Convert marvell,comphy-cp110 to DT schema") Signed-off-by: Rob Herring (Arm) Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20250806200138.1366189-1-robh@kernel.org Signed-off-by: Vinod Koul --- .../bindings/phy/marvell,comphy-cp110.yaml | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml b/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml index d9501df42886..c35d31642805 100644 --- a/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml +++ b/Documentation/devicetree/bindings/phy/marvell,comphy-cp110.yaml @@ -47,21 +47,19 @@ properties: const: 0 clocks: + minItems: 1 maxItems: 3 - description: Reference clocks for CP110; MG clock, MG Core clock, AXI clock clock-names: - items: - - const: mg_clk - - const: mg_core_clk - - const: axi_clk + minItems: 1 + maxItems: 3 marvell,system-controller: description: Phandle to the Marvell system controller (CP110 only) $ref: /schemas/types.yaml#/definitions/phandle patternProperties: - '^phy@[0-2]$': + '^phy@[0-5]$': description: A COMPHY lane child node type: object additionalProperties: false @@ -69,10 +67,14 @@ patternProperties: properties: reg: description: COMPHY lane number + maximum: 5 '#phy-cells': const: 1 + connector: + type: object + required: - reg - '#phy-cells' @@ -91,13 +93,24 @@ allOf: then: properties: - clocks: false - clock-names: false + clocks: + maxItems: 1 + clock-names: + const: xtal required: - reg-names else: + properties: + clocks: + minItems: 3 + clock-names: + items: + - const: mg_clk + - const: mg_core_clk + - const: axi_clk + required: - marvell,system-controller From bca065733afd1e3a89a02f05ffe14e966cd5f78e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 24 Jul 2025 15:12:04 +0200 Subject: [PATCH 006/450] phy: tegra: xusb: fix device and OF node leak at probe Make sure to drop the references taken to the PMC OF node and device by of_parse_phandle() and of_find_device_by_node() during probe. Note the holding a reference to the PMC device does not prevent the PMC regmap from going away (e.g. if the PMC driver is unbound) so there is no need to keep the reference. Fixes: 2d1021487273 ("phy: tegra: xusb: Add wake/sleepwalk for Tegra210") Cc: stable@vger.kernel.org # 5.14 Cc: JC Kuo Signed-off-by: Johan Hovold Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250724131206.2211-2-johan@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/tegra/xusb-tegra210.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/phy/tegra/xusb-tegra210.c b/drivers/phy/tegra/xusb-tegra210.c index ebc8a7e21a31..3409924498e9 100644 --- a/drivers/phy/tegra/xusb-tegra210.c +++ b/drivers/phy/tegra/xusb-tegra210.c @@ -3164,18 +3164,22 @@ tegra210_xusb_padctl_probe(struct device *dev, } pdev = of_find_device_by_node(np); + of_node_put(np); if (!pdev) { dev_warn(dev, "PMC device is not available\n"); goto out; } - if (!platform_get_drvdata(pdev)) + if (!platform_get_drvdata(pdev)) { + put_device(&pdev->dev); return ERR_PTR(-EPROBE_DEFER); + } padctl->regmap = dev_get_regmap(&pdev->dev, "usb_sleepwalk"); if (!padctl->regmap) dev_info(dev, "failed to find PMC regmap\n"); + put_device(&pdev->dev); out: return &padctl->base; } From 64961557efa1b98f375c0579779e7eeda1a02c42 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 24 Jul 2025 15:12:05 +0200 Subject: [PATCH 007/450] phy: ti: omap-usb2: fix device leak at unbind Make sure to drop the reference to the control device taken by of_find_device_by_node() during probe when the driver is unbound. Fixes: 478b6c7436c2 ("usb: phy: omap-usb2: Don't use omap_get_control_dev()") Cc: stable@vger.kernel.org # 3.13 Cc: Roger Quadros Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20250724131206.2211-3-johan@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/ti/phy-omap-usb2.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/phy/ti/phy-omap-usb2.c b/drivers/phy/ti/phy-omap-usb2.c index c1a0ef979142..c444bb2530ca 100644 --- a/drivers/phy/ti/phy-omap-usb2.c +++ b/drivers/phy/ti/phy-omap-usb2.c @@ -363,6 +363,13 @@ static void omap_usb2_init_errata(struct omap_usb *phy) phy->flags |= OMAP_USB2_DISABLE_CHRG_DET; } +static void omap_usb2_put_device(void *_dev) +{ + struct device *dev = _dev; + + put_device(dev); +} + static int omap_usb2_probe(struct platform_device *pdev) { struct omap_usb *phy; @@ -373,6 +380,7 @@ static int omap_usb2_probe(struct platform_device *pdev) struct device_node *control_node; struct platform_device *control_pdev; const struct usb_phy_data *phy_data; + int ret; phy_data = device_get_match_data(&pdev->dev); if (!phy_data) @@ -423,6 +431,11 @@ static int omap_usb2_probe(struct platform_device *pdev) return -EINVAL; } phy->control_dev = &control_pdev->dev; + + ret = devm_add_action_or_reset(&pdev->dev, omap_usb2_put_device, + phy->control_dev); + if (ret) + return ret; } else { if (of_property_read_u32_index(node, "syscon-phy-power", 1, From e19bcea99749ce8e8f1d359f68ae03210694ad56 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 24 Jul 2025 15:12:06 +0200 Subject: [PATCH 008/450] phy: ti-pipe3: fix device leak at unbind Make sure to drop the reference to the control device taken by of_find_device_by_node() during probe when the driver is unbound. Fixes: 918ee0d21ba4 ("usb: phy: omap-usb3: Don't use omap_get_control_dev()") Cc: stable@vger.kernel.org # 3.13 Cc: Roger Quadros Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20250724131206.2211-4-johan@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/ti/phy-ti-pipe3.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/phy/ti/phy-ti-pipe3.c b/drivers/phy/ti/phy-ti-pipe3.c index da2cbacb982c..ae764d6524c9 100644 --- a/drivers/phy/ti/phy-ti-pipe3.c +++ b/drivers/phy/ti/phy-ti-pipe3.c @@ -667,12 +667,20 @@ static int ti_pipe3_get_clk(struct ti_pipe3 *phy) return 0; } +static void ti_pipe3_put_device(void *_dev) +{ + struct device *dev = _dev; + + put_device(dev); +} + static int ti_pipe3_get_sysctrl(struct ti_pipe3 *phy) { struct device *dev = phy->dev; struct device_node *node = dev->of_node; struct device_node *control_node; struct platform_device *control_pdev; + int ret; phy->phy_power_syscon = syscon_regmap_lookup_by_phandle(node, "syscon-phy-power"); @@ -704,6 +712,11 @@ static int ti_pipe3_get_sysctrl(struct ti_pipe3 *phy) } phy->control_dev = &control_pdev->dev; + + ret = devm_add_action_or_reset(dev, ti_pipe3_put_device, + phy->control_dev); + if (ret) + return ret; } if (phy->mode == PIPE3_MODE_PCIE) { From aac1256a41cfbbaca12d6c0a5753d1e3b8d2d8bf Mon Sep 17 00:00:00 2001 From: Ziyue Zhang Date: Fri, 25 Jul 2025 18:22:29 +0800 Subject: [PATCH 009/450] dt-bindings: phy: qcom,sc8280xp-qmp-pcie-phy: Update pcie phy bindings The gcc_aux_clk is required by the PCIe controller but not by the PCIe PHY. In PCIe PHY, the source of aux_clk used in low-power mode should be gcc_phy_aux_clk. Hence, remove gcc_aux_clk and replace it with gcc_phy_aux_clk. Fixes: fd2d4e4c1986 ("dt-bindings: phy: qcom,qmp: Add sa8775p QMP PCIe PHY") Signed-off-by: Ziyue Zhang Acked-by: Rob Herring (Arm) Reviewed-by: Johan Hovold Link: https://lore.kernel.org/r/20250725102231.3608298-2-ziyue.zhang@oss.qualcomm.com Signed-off-by: Vinod Koul --- .../devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml index a1ae8c7988c8..b6f140bf5b3b 100644 --- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml +++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml @@ -176,6 +176,8 @@ allOf: compatible: contains: enum: + - qcom,sa8775p-qmp-gen4x2-pcie-phy + - qcom,sa8775p-qmp-gen4x4-pcie-phy - qcom,sc8280xp-qmp-gen3x1-pcie-phy - qcom,sc8280xp-qmp-gen3x2-pcie-phy - qcom,sc8280xp-qmp-gen3x4-pcie-phy @@ -197,8 +199,6 @@ allOf: contains: enum: - qcom,qcs8300-qmp-gen4x2-pcie-phy - - qcom,sa8775p-qmp-gen4x2-pcie-phy - - qcom,sa8775p-qmp-gen4x4-pcie-phy then: properties: clocks: From 535fd4c98452c87537a40610abba45daf5761ec6 Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Thu, 31 Jul 2025 08:44:50 -0400 Subject: [PATCH 010/450] serial: sc16is7xx: fix bug in flow control levels init When trying to set MCR[2], XON1 is incorrectly accessed instead. And when writing to the TCR register to configure flow control levels, we are incorrectly writing to the MSR register. The default value of $00 is then used for TCR, which means that selectable trigger levels in FCR are used in place of TCR. TCR/TLR access requires EFR[4] (enable enhanced functions) and MCR[2] to be set. EFR[4] is already set in probe(). MCR access requires LCR[7] to be zero. Since LCR is set to $BF when trying to set MCR[2], XON1 is incorrectly accessed instead because MCR shares the same address space as XON1. Since MCR[2] is unmodified and still zero, when writing to TCR we are in fact writing to MSR because TCR/TLR registers share the same address space as MSR/SPR. Fix by first removing useless reconfiguration of EFR[4] (enable enhanced functions), as it is already enabled in sc16is7xx_probe() since commit 43c51bb573aa ("sc16is7xx: make sure device is in suspend once probed"). Now LCR is $00, which means that MCR access is enabled. Also remove regcache_cache_bypass() calls since we no longer access the enhanced registers set, and TCR is already declared as volatile (in fact by declaring MSR as volatile, which shares the same address). Finally disable access to TCR/TLR registers after modifying them by clearing MCR[2]. Note: the comment about "... and internal clock div" is wrong and can be ignored/removed as access to internal clock div registers (DLL/DLH) is permitted only when LCR[7] is logic 1, not when enhanced features is enabled. And DLL/DLH access is not needed in sc16is7xx_startup(). Fixes: dfeae619d781 ("serial: sc16is7xx") Cc: stable@vger.kernel.org Signed-off-by: Hugo Villeneuve Link: https://lore.kernel.org/r/20250731124451.1108864-1-hugo@hugovil.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sc16is7xx.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index 3f38fba8f6ea..a668e0bb26b3 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -1177,17 +1177,6 @@ static int sc16is7xx_startup(struct uart_port *port) sc16is7xx_port_write(port, SC16IS7XX_FCR_REG, SC16IS7XX_FCR_FIFO_BIT); - /* Enable EFR */ - sc16is7xx_port_write(port, SC16IS7XX_LCR_REG, - SC16IS7XX_LCR_CONF_MODE_B); - - regcache_cache_bypass(one->regmap, true); - - /* Enable write access to enhanced features and internal clock div */ - sc16is7xx_port_update(port, SC16IS7XX_EFR_REG, - SC16IS7XX_EFR_ENABLE_BIT, - SC16IS7XX_EFR_ENABLE_BIT); - /* Enable TCR/TLR */ sc16is7xx_port_update(port, SC16IS7XX_MCR_REG, SC16IS7XX_MCR_TCRTLR_BIT, @@ -1199,7 +1188,8 @@ static int sc16is7xx_startup(struct uart_port *port) SC16IS7XX_TCR_RX_RESUME(24) | SC16IS7XX_TCR_RX_HALT(48)); - regcache_cache_bypass(one->regmap, false); + /* Disable TCR/TLR access */ + sc16is7xx_port_update(port, SC16IS7XX_MCR_REG, SC16IS7XX_MCR_TCRTLR_BIT, 0); /* Now, initialize the UART */ sc16is7xx_port_write(port, SC16IS7XX_LCR_REG, SC16IS7XX_LCR_WORD_LEN_8); From ee047e1d85d73496541c54bd4f432c9464e13e65 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 12 Aug 2025 14:16:31 +0200 Subject: [PATCH 011/450] dt-bindings: serial: brcm,bcm7271-uart: Constrain clocks Lists should have fixed constraints, because binding must be specific in respect to hardware, thus add missing constraints to number of clocks. Cc: stable Fixes: 88a499cd70d4 ("dt-bindings: Add support for the Broadcom UART driver") Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250812121630.67072-2-krzysztof.kozlowski@linaro.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml b/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml index 89c462653e2d..8cc848ae11cb 100644 --- a/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml +++ b/Documentation/devicetree/bindings/serial/brcm,bcm7271-uart.yaml @@ -41,7 +41,7 @@ properties: - const: dma_intr2 clocks: - minItems: 1 + maxItems: 1 clock-names: const: sw_baud From 387d00028cccee7575f6416953bef62f849d83e3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 12 Aug 2025 22:21:50 -0500 Subject: [PATCH 012/450] dt-bindings: serial: 8250: move a constraint A block that required a "spacemit,k1-uart" compatible node to specify two clocks was placed in the wrong spot in the binding. Conor Dooley pointed out it belongs earlier in the file, as part of the initial "allOf". Fixes: 2c0594f9f0629 ("dt-bindings: serial: 8250: support an optional second clock") Cc: stable Reported-by: Conor Dooley Closes: https://lore.kernel.org/lkml/20250729-reshuffle-contented-e6def76b540b@spud/ Signed-off-by: Alex Elder Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250813032151.2330616-1-elder@riscstar.com Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/serial/8250.yaml | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/Documentation/devicetree/bindings/serial/8250.yaml b/Documentation/devicetree/bindings/serial/8250.yaml index e46bee8d25bf..f59c0b37e8eb 100644 --- a/Documentation/devicetree/bindings/serial/8250.yaml +++ b/Documentation/devicetree/bindings/serial/8250.yaml @@ -48,7 +48,6 @@ allOf: oneOf: - required: [ clock-frequency ] - required: [ clocks ] - - if: properties: compatible: @@ -66,6 +65,28 @@ allOf: items: - const: core - const: bus + - if: + properties: + compatible: + contains: + enum: + - spacemit,k1-uart + - nxp,lpc1850-uart + then: + required: + - clocks + - clock-names + properties: + clocks: + minItems: 2 + clock-names: + minItems: 2 + else: + properties: + clocks: + maxItems: 1 + clock-names: + maxItems: 1 properties: compatible: @@ -264,29 +285,6 @@ required: - reg - interrupts -if: - properties: - compatible: - contains: - enum: - - spacemit,k1-uart - - nxp,lpc1850-uart -then: - required: - - clocks - - clock-names - properties: - clocks: - minItems: 2 - clock-names: - minItems: 2 -else: - properties: - clocks: - maxItems: 1 - clock-names: - maxItems: 1 - unevaluatedProperties: false examples: From a1b51534b532dd4f0499907865553ee9251bebc3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 12 Aug 2025 22:13:35 -0500 Subject: [PATCH 013/450] dt-bindings: serial: 8250: allow "main" and "uart" as clock names There are two compatible strings defined in "8250.yaml" that require two clocks to be specified, along with their names: - "spacemit,k1-uart", used in "spacemit/k1.dtsi" - "nxp,lpc1850-uart", used in "lpc/lpc18xx.dtsi" When only one clock is used, the name is not required. However there are two places that do specify a name: - In "mediatek/mt7623.dtsi", the clock for the "mediatek,mtk-btif" compatible serial device is named "main" - In "qca/ar9132.dtsi", the clock for the "ns8250" compatible serial device is named "uart" In commit d2db0d7815444 ("dt-bindings: serial: 8250: allow clock 'uartclk' and 'reg' for nxp,lpc1850-uart"), Frank Li added the restriction that two named clocks be used for the NXP platform mentioned above. Change that logic, so that an additional condition for (only) the SpacemiT platform similarly restricts the two clocks to have the names "core" and "bus". Finally, add "main" and "uart" as allowed names when a single clock is specified. Fixes: 2c0594f9f0629 ("dt-bindings: serial: 8250: support an optional second clock") Cc: stable Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507160314.wrC51lXX-lkp@intel.com/ Signed-off-by: Alex Elder Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250813031338.2328392-1-elder@riscstar.com Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/serial/8250.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/serial/8250.yaml b/Documentation/devicetree/bindings/serial/8250.yaml index f59c0b37e8eb..b243afa69a1a 100644 --- a/Documentation/devicetree/bindings/serial/8250.yaml +++ b/Documentation/devicetree/bindings/serial/8250.yaml @@ -59,7 +59,12 @@ allOf: items: - const: uartclk - const: reg - else: + - if: + properties: + compatible: + contains: + const: spacemit,k1-uart + then: properties: clock-names: items: @@ -183,6 +188,9 @@ properties: minItems: 1 maxItems: 2 oneOf: + - enum: + - main + - uart - items: - const: core - const: bus From 9969779d0803f5dcd4460ae7aca2bc3fd91bff12 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:42 -0700 Subject: [PATCH 014/450] Documentation/hw-vuln: Add VMSCAPE documentation VMSCAPE is a vulnerability that may allow a guest to influence the branch prediction in host userspace, particularly affecting hypervisors like QEMU. Add the documentation. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Dave Hansen --- Documentation/admin-guide/hw-vuln/index.rst | 1 + Documentation/admin-guide/hw-vuln/vmscape.rst | 110 ++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 Documentation/admin-guide/hw-vuln/vmscape.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 89ca636081b7..55d747511f83 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -26,3 +26,4 @@ are configurable at compile, boot or run time. rsb old_microcode indirect-target-selection + vmscape diff --git a/Documentation/admin-guide/hw-vuln/vmscape.rst b/Documentation/admin-guide/hw-vuln/vmscape.rst new file mode 100644 index 000000000000..d9b9a2b6c114 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/vmscape.rst @@ -0,0 +1,110 @@ +.. SPDX-License-Identifier: GPL-2.0 + +VMSCAPE +======= + +VMSCAPE is a vulnerability that may allow a guest to influence the branch +prediction in host userspace. It particularly affects hypervisors like QEMU. + +Even if a hypervisor may not have any sensitive data like disk encryption keys, +guest-userspace may be able to attack the guest-kernel using the hypervisor as +a confused deputy. + +Affected processors +------------------- + +The following CPU families are affected by VMSCAPE: + +**Intel processors:** + - Skylake generation (Parts without Enhanced-IBRS) + - Cascade Lake generation - (Parts affected by ITS guest/host separation) + - Alder Lake and newer (Parts affected by BHI) + +Note that, BHI affected parts that use BHB clearing software mitigation e.g. +Icelake are not vulnerable to VMSCAPE. + +**AMD processors:** + - Zen series (families 0x17, 0x19, 0x1a) + +** Hygon processors:** + - Family 0x18 + +Mitigation +---------- + +Conditional IBPB +---------------- + +Kernel tracks when a CPU has run a potentially malicious guest and issues an +IBPB before the first exit to userspace after VM-exit. If userspace did not run +between VM-exit and the next VM-entry, no IBPB is issued. + +Note that the existing userspace mitigation against Spectre-v2 is effective in +protecting the userspace. They are insufficient to protect the userspace VMMs +from a malicious guest. This is because Spectre-v2 mitigations are applied at +context switch time, while the userspace VMM can run after a VM-exit without a +context switch. + +Vulnerability enumeration and mitigation is not applied inside a guest. This is +because nested hypervisors should already be deploying IBPB to isolate +themselves from nested guests. + +SMT considerations +------------------ + +When Simultaneous Multi-Threading (SMT) is enabled, hypervisors can be +vulnerable to cross-thread attacks. For complete protection against VMSCAPE +attacks in SMT environments, STIBP should be enabled. + +The kernel will issue a warning if SMT is enabled without adequate STIBP +protection. Warning is not issued when: + +- SMT is disabled +- STIBP is enabled system-wide +- Intel eIBRS is enabled (which implies STIBP protection) + +System information and options +------------------------------ + +The sysfs file showing VMSCAPE mitigation status is: + + /sys/devices/system/cpu/vulnerabilities/vmscape + +The possible values in this file are: + + * 'Not affected': + + The processor is not vulnerable to VMSCAPE attacks. + + * 'Vulnerable': + + The processor is vulnerable and no mitigation has been applied. + + * 'Mitigation: IBPB before exit to userspace': + + Conditional IBPB mitigation is enabled. The kernel tracks when a CPU has + run a potentially malicious guest and issues an IBPB before the first + exit to userspace after VM-exit. + + * 'Mitigation: IBPB on VMEXIT': + + IBPB is issued on every VM-exit. This occurs when other mitigations like + RETBLEED or SRSO are already issuing IBPB on VM-exit. + +Mitigation control on the kernel command line +---------------------------------------------- + +The mitigation can be controlled via the ``vmscape=`` command line parameter: + + * ``vmscape=off``: + + Disable the VMSCAPE mitigation. + + * ``vmscape=ibpb``: + + Enable conditional IBPB mitigation (default when CONFIG_MITIGATION_VMSCAPE=y). + + * ``vmscape=force``: + + Force vulnerability detection and mitigation even on processors that are + not known to be affected. From a508cec6e5215a3fbc7e73ae86a5c5602187934d Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:42 -0700 Subject: [PATCH 015/450] x86/vmscape: Enumerate VMSCAPE bug The VMSCAPE vulnerability may allow a guest to cause Branch Target Injection (BTI) in userspace hypervisors. Kernels (both host and guest) have existing defenses against direct BTI attacks from guests. There are also inter-process BTI mitigations which prevent processes from attacking each other. However, the threat in this case is to a userspace hypervisor within the same process as the attacker. Userspace hypervisors have access to their own sensitive data like disk encryption keys and also typically have access to all guest data. This means guest userspace may use the hypervisor as a confused deputy to attack sensitive guest kernel data. There are no existing mitigations for these attacks. Introduce X86_BUG_VMSCAPE for this vulnerability and set it on affected Intel and AMD CPUs. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/common.c | 65 ++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 602957dd2609..b6fa5c33c85d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -550,4 +550,5 @@ #define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */ #define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ #define X86_BUG_TSA X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */ +#define X86_BUG_VMSCAPE X86_BUG( 1*32+10) /* "vmscape" CPU is affected by VMSCAPE attacks from guests */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 34a054181c4d..2b87c93e6609 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1236,6 +1236,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define ITS_NATIVE_ONLY BIT(9) /* CPU is affected by Transient Scheduler Attacks */ #define TSA BIT(10) +/* CPU is affected by VMSCAPE */ +#define VMSCAPE BIT(11) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), @@ -1247,44 +1249,55 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), - VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), - VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), - VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), - VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), - VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS), - VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS), - VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_METEORLAKE_L, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_H, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_U, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_LUNARLAKE_M, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SAPPHIRERAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_GRANITERAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_EMERALDRAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS), VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_CRESTMONT_X, X86_STEP_MAX, VMSCAPE), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), - VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), - VULNBL_AMD(0x19, SRSO | TSA), - VULNBL_AMD(0x1a, SRSO), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO | VMSCAPE), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO | VMSCAPE), + VULNBL_AMD(0x19, SRSO | TSA | VMSCAPE), + VULNBL_AMD(0x1a, SRSO | VMSCAPE), {} }; @@ -1543,6 +1556,14 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } } + /* + * Set the bug only on bare-metal. A nested hypervisor should already be + * deploying IBPB to isolate itself from nested guests. + */ + if (cpu_matches(cpu_vuln_blacklist, VMSCAPE) && + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) + setup_force_cpu_bug(X86_BUG_VMSCAPE); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; From 2f8f173413f1cbf52660d04df92d0069c4306d25 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:42 -0700 Subject: [PATCH 016/450] x86/vmscape: Add conditional IBPB mitigation VMSCAPE is a vulnerability that exploits insufficient branch predictor isolation between a guest and a userspace hypervisor (like QEMU). Existing mitigations already protect kernel/KVM from a malicious guest. Userspace can additionally be protected by flushing the branch predictors after a VMexit. Since it is the userspace that consumes the poisoned branch predictors, conditionally issue an IBPB after a VMexit and before returning to userspace. Workloads that frequently switch between hypervisor and userspace will incur the most overhead from the new IBPB. This new IBPB is not integrated with the existing IBPB sites. For instance, a task can use the existing speculation control prctl() to get an IBPB at context switch time. With this implementation, the IBPB is doubled up: one at context switch and another before running userspace. The intent is to integrate and optimize these cases post-embargo. [ dhansen: elaborate on suboptimal IBPB solution ] Suggested-by: Dave Hansen Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Acked-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/entry-common.h | 7 +++++++ arch/x86/include/asm/nospec-branch.h | 2 ++ arch/x86/kernel/cpu/bugs.c | 8 ++++++++ arch/x86/kvm/x86.c | 9 +++++++++ 5 files changed, 27 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index b6fa5c33c85d..c8e177016cc4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -494,6 +494,7 @@ #define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */ #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ +#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ /* * BUG word(s) diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index d535a97c7284..ce3eb6d5fdf9 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -93,6 +93,13 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, * 8 (ia32) bits. */ choose_random_kstack_offset(rdtsc()); + + /* Avoid unnecessary reads of 'x86_ibpb_exit_to_user' */ + if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER) && + this_cpu_read(x86_ibpb_exit_to_user)) { + indirect_branch_prediction_barrier(); + this_cpu_write(x86_ibpb_exit_to_user, false); + } } #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 10f261678749..e29f82466f43 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -530,6 +530,8 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) : "memory"); } +DECLARE_PER_CPU(bool, x86_ibpb_exit_to_user); + static inline void indirect_branch_prediction_barrier(void) { asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_IBPB) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b74bf937cd9f..410f8df8b77a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -105,6 +105,14 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); +/* + * Set when the CPU has run a potentially malicious guest. An IBPB will + * be needed to before running userspace. That IBPB will flush the branch + * predictor content. + */ +DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user); +EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user); + u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; static u64 __ro_after_init x86_arch_cap_msr; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1c49bc681c4..58d19443c9a3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11007,6 +11007,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_fpu.xfd_err) wrmsrq(MSR_IA32_XFD_ERR, 0); + /* + * Mark this CPU as needing a branch predictor flush before running + * userspace. Must be done before enabling preemption to ensure it gets + * set for the CPU that actually ran the guest, and not the CPU that it + * may migrate to. + */ + if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER)) + this_cpu_write(x86_ibpb_exit_to_user, true); + /* * Consume any pending interrupts, including the possible source of * VM-Exit on SVM and any ticks that occur between VM-Exit and now. From 556c1ad666ad90c50ec8fccb930dd5046cfbecfb Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:42 -0700 Subject: [PATCH 017/450] x86/vmscape: Enable the mitigation Enable the previously added mitigation for VMscape. Add the cmdline vmscape={off|ibpb|force} and sysfs reporting. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Borislav Petkov (AMD) Reviewed-by: Dave Hansen --- .../ABI/testing/sysfs-devices-system-cpu | 1 + .../admin-guide/kernel-parameters.txt | 11 +++ arch/x86/Kconfig | 9 ++ arch/x86/kernel/cpu/bugs.c | 90 +++++++++++++++++++ drivers/base/cpu.c | 3 + include/linux/cpu.h | 1 + 6 files changed, 115 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index ab8cd337f43a..8aed6d94c4cd 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -586,6 +586,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/srbds /sys/devices/system/cpu/vulnerabilities/tsa /sys/devices/system/cpu/vulnerabilities/tsx_async_abort + /sys/devices/system/cpu/vulnerabilities/vmscape Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 747a55abf494..5a7a83c411e9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3829,6 +3829,7 @@ srbds=off [X86,INTEL] ssbd=force-off [ARM64] tsx_async_abort=off [X86] + vmscape=off [X86] Exceptions: This does not have any effect on @@ -8041,6 +8042,16 @@ vmpoff= [KNL,S390] Perform z/VM CP command after power off. Format: + vmscape= [X86] Controls mitigation for VMscape attacks. + VMscape attacks can leak information from a userspace + hypervisor to a guest via speculative side-channels. + + off - disable the mitigation + ibpb - use Indirect Branch Prediction Barrier + (IBPB) mitigation (default) + force - force vulnerability detection even on + unaffected processors + vsyscall= [X86-64,EARLY] Controls the behavior of vsyscalls (i.e. calls to fixed addresses of 0xffffffffff600x00 from legacy diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100..52c8910ba2ef 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2701,6 +2701,15 @@ config MITIGATION_TSA security vulnerability on AMD CPUs which can lead to forwarding of invalid info to subsequent instructions and thus can affect their timing and thereby cause a leakage. + +config MITIGATION_VMSCAPE + bool "Mitigate VMSCAPE" + depends on KVM + default y + help + Enable mitigation for VMSCAPE attacks. VMSCAPE is a hardware security + vulnerability on Intel and AMD CPUs that may allow a guest to do + Spectre v2 style attacks on userspace hypervisor. endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 410f8df8b77a..c81024dfc4c8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -96,6 +96,9 @@ static void __init its_update_mitigation(void); static void __init its_apply_mitigation(void); static void __init tsa_select_mitigation(void); static void __init tsa_apply_mitigation(void); +static void __init vmscape_select_mitigation(void); +static void __init vmscape_update_mitigation(void); +static void __init vmscape_apply_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -270,6 +273,7 @@ void __init cpu_select_mitigations(void) its_select_mitigation(); bhi_select_mitigation(); tsa_select_mitigation(); + vmscape_select_mitigation(); /* * After mitigations are selected, some may need to update their @@ -301,6 +305,7 @@ void __init cpu_select_mitigations(void) bhi_update_mitigation(); /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ srso_update_mitigation(); + vmscape_update_mitigation(); spectre_v1_apply_mitigation(); spectre_v2_apply_mitigation(); @@ -318,6 +323,7 @@ void __init cpu_select_mitigations(void) its_apply_mitigation(); bhi_apply_mitigation(); tsa_apply_mitigation(); + vmscape_apply_mitigation(); } /* @@ -3322,6 +3328,77 @@ static void __init srso_apply_mitigation(void) } } +#undef pr_fmt +#define pr_fmt(fmt) "VMSCAPE: " fmt + +enum vmscape_mitigations { + VMSCAPE_MITIGATION_NONE, + VMSCAPE_MITIGATION_AUTO, + VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER, + VMSCAPE_MITIGATION_IBPB_ON_VMEXIT, +}; + +static const char * const vmscape_strings[] = { + [VMSCAPE_MITIGATION_NONE] = "Vulnerable", + /* [VMSCAPE_MITIGATION_AUTO] */ + [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace", + [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT", +}; + +static enum vmscape_mitigations vmscape_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE; + +static int __init vmscape_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } else if (!strcmp(str, "ibpb")) { + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_VMSCAPE); + vmscape_mitigation = VMSCAPE_MITIGATION_AUTO; + } else { + pr_err("Ignoring unknown vmscape=%s option.\n", str); + } + + return 0; +} +early_param("vmscape", vmscape_parse_cmdline); + +static void __init vmscape_select_mitigation(void) +{ + if (cpu_mitigations_off() || + !boot_cpu_has_bug(X86_BUG_VMSCAPE) || + !boot_cpu_has(X86_FEATURE_IBPB)) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + return; + } + + if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; +} + +static void __init vmscape_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE)) + return; + + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB || + srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT; + + pr_info("%s\n", vmscape_strings[vmscape_mitigation]); +} + +static void __init vmscape_apply_mitigation(void) +{ + if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER) + setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER); +} + #undef pr_fmt #define pr_fmt(fmt) fmt @@ -3570,6 +3647,11 @@ static ssize_t tsa_show_state(char *buf) return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]); } +static ssize_t vmscape_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -3636,6 +3718,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_TSA: return tsa_show_state(buf); + case X86_BUG_VMSCAPE: + return vmscape_show_state(buf); + default: break; } @@ -3727,6 +3812,11 @@ ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_TSA); } + +ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE); +} #endif void __warn_thunk(void) diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index efc575a00edd..008da0354fba 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -603,6 +603,7 @@ CPU_SHOW_VULN_FALLBACK(ghostwrite); CPU_SHOW_VULN_FALLBACK(old_microcode); CPU_SHOW_VULN_FALLBACK(indirect_target_selection); CPU_SHOW_VULN_FALLBACK(tsa); +CPU_SHOW_VULN_FALLBACK(vmscape); static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); @@ -622,6 +623,7 @@ static DEVICE_ATTR(ghostwrite, 0444, cpu_show_ghostwrite, NULL); static DEVICE_ATTR(old_microcode, 0444, cpu_show_old_microcode, NULL); static DEVICE_ATTR(indirect_target_selection, 0444, cpu_show_indirect_target_selection, NULL); static DEVICE_ATTR(tsa, 0444, cpu_show_tsa, NULL); +static DEVICE_ATTR(vmscape, 0444, cpu_show_vmscape, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -642,6 +644,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_old_microcode.attr, &dev_attr_indirect_target_selection.attr, &dev_attr_tsa.attr, + &dev_attr_vmscape.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b91b993f58ee..487b3bf2e1ea 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -83,6 +83,7 @@ extern ssize_t cpu_show_old_microcode(struct device *dev, extern ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, From 6449f5baf9c78a7a442d64f4a61378a21c5db113 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:43 -0700 Subject: [PATCH 018/450] x86/bugs: Move cpu_bugs_smt_update() down cpu_bugs_smt_update() uses global variables from different mitigations. For SMT updates it can't currently use vmscape_mitigation that is defined after it. Since cpu_bugs_smt_update() depends on many other mitigations, move it after all mitigations are defined. With that, it can use vmscape_mitigation in a moment. No functional change. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen --- arch/x86/kernel/cpu/bugs.c | 165 +++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 82 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c81024dfc4c8..1f8c1c51d057 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -2551,88 +2551,6 @@ static void update_mds_branch_idle(void) } } -#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" -#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" -#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" - -void cpu_bugs_smt_update(void) -{ - mutex_lock(&spec_ctrl_mutex); - - if (sched_smt_active() && unprivileged_ebpf_enabled() && - spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) - pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); - - switch (spectre_v2_user_stibp) { - case SPECTRE_V2_USER_NONE: - break; - case SPECTRE_V2_USER_STRICT: - case SPECTRE_V2_USER_STRICT_PREFERRED: - update_stibp_strict(); - break; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: - update_indir_branch_cond(); - break; - } - - switch (mds_mitigation) { - case MDS_MITIGATION_FULL: - case MDS_MITIGATION_AUTO: - case MDS_MITIGATION_VMWERV: - if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) - pr_warn_once(MDS_MSG_SMT); - update_mds_branch_idle(); - break; - case MDS_MITIGATION_OFF: - break; - } - - switch (taa_mitigation) { - case TAA_MITIGATION_VERW: - case TAA_MITIGATION_AUTO: - case TAA_MITIGATION_UCODE_NEEDED: - if (sched_smt_active()) - pr_warn_once(TAA_MSG_SMT); - break; - case TAA_MITIGATION_TSX_DISABLED: - case TAA_MITIGATION_OFF: - break; - } - - switch (mmio_mitigation) { - case MMIO_MITIGATION_VERW: - case MMIO_MITIGATION_AUTO: - case MMIO_MITIGATION_UCODE_NEEDED: - if (sched_smt_active()) - pr_warn_once(MMIO_MSG_SMT); - break; - case MMIO_MITIGATION_OFF: - break; - } - - switch (tsa_mitigation) { - case TSA_MITIGATION_USER_KERNEL: - case TSA_MITIGATION_VM: - case TSA_MITIGATION_AUTO: - case TSA_MITIGATION_FULL: - /* - * TSA-SQ can potentially lead to info leakage between - * SMT threads. - */ - if (sched_smt_active()) - static_branch_enable(&cpu_buf_idle_clear); - else - static_branch_disable(&cpu_buf_idle_clear); - break; - case TSA_MITIGATION_NONE: - case TSA_MITIGATION_UCODE_NEEDED: - break; - } - - mutex_unlock(&spec_ctrl_mutex); -} - #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt @@ -3402,6 +3320,89 @@ static void __init vmscape_apply_mitigation(void) #undef pr_fmt #define pr_fmt(fmt) fmt +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" +#define VMSCAPE_MSG_SMT "VMSCAPE: SMT on, STIBP is required for full protection. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/vmscape.html for more details.\n" + +void cpu_bugs_smt_update(void) +{ + mutex_lock(&spec_ctrl_mutex); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + + switch (spectre_v2_user_stibp) { + case SPECTRE_V2_USER_NONE: + break; + case SPECTRE_V2_USER_STRICT: + case SPECTRE_V2_USER_STRICT_PREFERRED: + update_stibp_strict(); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + update_indir_branch_cond(); + break; + } + + switch (mds_mitigation) { + case MDS_MITIGATION_FULL: + case MDS_MITIGATION_AUTO: + case MDS_MITIGATION_VMWERV: + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + pr_warn_once(MDS_MSG_SMT); + update_mds_branch_idle(); + break; + case MDS_MITIGATION_OFF: + break; + } + + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_AUTO: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_AUTO: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + + switch (tsa_mitigation) { + case TSA_MITIGATION_USER_KERNEL: + case TSA_MITIGATION_VM: + case TSA_MITIGATION_AUTO: + case TSA_MITIGATION_FULL: + /* + * TSA-SQ can potentially lead to info leakage between + * SMT threads. + */ + if (sched_smt_active()) + static_branch_enable(&cpu_buf_idle_clear); + else + static_branch_disable(&cpu_buf_idle_clear); + break; + case TSA_MITIGATION_NONE: + case TSA_MITIGATION_UCODE_NEEDED: + break; + } + + mutex_unlock(&spec_ctrl_mutex); +} + #ifdef CONFIG_SYSFS #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" From b7cc9887231526ca4fa89f3fa4119e47c2dc7b1e Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 14 Aug 2025 10:20:43 -0700 Subject: [PATCH 019/450] x86/vmscape: Warn when STIBP is disabled with SMT Cross-thread attacks are generally harder as they require the victim to be co-located on a core. However, with VMSCAPE the adversary targets belong to the same guest execution, that are more likely to get co-located. In particular, a thread that is currently executing userspace hypervisor (after the IBPB) may still be targeted by a guest execution from a sibling thread. Issue a warning about the potential risk, except when: - SMT is disabled - STIBP is enabled system-wide - Intel eIBRS is enabled (which implies STIBP protection) Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen --- arch/x86/kernel/cpu/bugs.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1f8c1c51d057..fa32615db71d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -3400,6 +3400,28 @@ void cpu_bugs_smt_update(void) break; } + switch (vmscape_mitigation) { + case VMSCAPE_MITIGATION_NONE: + case VMSCAPE_MITIGATION_AUTO: + break; + case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT: + case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER: + /* + * Hypervisors can be attacked across-threads, warn for SMT when + * STIBP is not already enabled system-wide. + * + * Intel eIBRS (!AUTOIBRS) implies STIBP on. + */ + if (!sched_smt_active() || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + break; + pr_warn_once(VMSCAPE_MSG_SMT); + break; + } + mutex_unlock(&spec_ctrl_mutex); } From 2b986b9e917bc88f81aa1ed386af63b26c983f1d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 Aug 2025 20:24:37 +0200 Subject: [PATCH 020/450] bpf, cpumap: Disable page_pool direct xdp_return need larger scope When running an XDP bpf_prog on the remote CPU in cpumap code then we must disable the direct return optimization that xdp_return can perform for mem_type page_pool. This optimization assumes code is still executing under RX-NAPI of the original receiving CPU, which isn't true on this remote CPU. The cpumap code already disabled this via helpers xdp_set_return_frame_no_direct() and xdp_clear_return_frame_no_direct(), but the scope didn't include xdp_do_flush(). When doing XDP_REDIRECT towards e.g devmap this causes the function bq_xmit_all() to run with direct return optimization enabled. This can lead to hard to find bugs. The issue only happens when bq_xmit_all() cannot ndo_xdp_xmit all frames and them frees them via xdp_return_frame_rx_napi(). Fix by expanding scope to include xdp_do_flush(). This was found by Dragos Tatulea. Fixes: 11941f8a8536 ("bpf: cpumap: Implement generic cpumap") Reported-by: Dragos Tatulea Reported-by: Chris Arges Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Tested-by: Chris Arges Link: https://patch.msgid.link/175519587755.3008742.1088294435150406835.stgit@firesoul --- kernel/bpf/cpumap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b2b7b8ec2c2a..c46360b27871 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, struct xdp_buff xdp; int i, nframes = 0; - xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; for (i = 0; i < n; i++) { @@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } - xdp_clear_return_frame_no_direct(); stats->pass += nframes; return nframes; @@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + xdp_set_return_frame_no_direct(); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); if (unlikely(ret->skb_n)) @@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (stats->redirect) xdp_do_flush(); + xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); From e4414b01c1cd9887bbde92f946c1ba94e40d6d64 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Aug 2025 22:06:55 +0200 Subject: [PATCH 021/450] bpf: Check the helper function is valid in get_helper_proto kernel test robot reported verifier bug [1] where the helper func pointer could be NULL due to disabled config option. As Alexei suggested we could check on that in get_helper_proto directly. Marking tail_call helper func with BPF_PTR_POISON, because it is unused by design. [1] https://lore.kernel.org/oe-lkp/202507160818.68358831-lkp@intel.com Reported-by: kernel test robot Reported-by: syzbot+a9ed3d9132939852d0df@syzkaller.appspotmail.com Suggested-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Paul Chaignon Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250814200655.945632-1-jolsa@kernel.org Closes: https://lore.kernel.org/oe-lkp/202507160818.68358831-lkp@intel.com --- kernel/bpf/core.c | 5 ++++- kernel/bpf/verifier.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5d1650af899d..f8ac77d08ca7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3024,7 +3024,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { - .func = NULL, + /* func is unused for tail_call, we set it to pass the + * get_helper_proto check + */ + .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af6..c89e2b1bc644 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11354,7 +11354,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return -EINVAL; *ptr = env->ops->get_func_proto(func_id, env->prog); - return *ptr ? 0 : -EINVAL; + return *ptr && (*ptr)->func ? 0 : -EINVAL; } static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, From cfd956dcb101aa3d25bac321fae923323a47c607 Mon Sep 17 00:00:00 2001 From: Fabian Vogt Date: Fri, 15 Aug 2025 13:33:28 +0200 Subject: [PATCH 022/450] tty: hvc_console: Call hvc_kick in hvc_write unconditionally After hvc_write completes, call hvc_kick also in the case the output buffer has been drained, to ensure tty_wakeup gets called. This fixes that functions which wait for a drained buffer got stuck occasionally. Cc: stable Closes: https://bugzilla.opensuse.org/show_bug.cgi?id=1230062 Signed-off-by: Fabian Vogt Link: https://lore.kernel.org/r/2011735.PYKUYFuaPT@fvogt-thinkpad Signed-off-by: Greg Kroah-Hartman --- drivers/tty/hvc/hvc_console.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c index cd1f657f782d..13c663a154c4 100644 --- a/drivers/tty/hvc/hvc_console.c +++ b/drivers/tty/hvc/hvc_console.c @@ -543,10 +543,10 @@ static ssize_t hvc_write(struct tty_struct *tty, const u8 *buf, size_t count) } /* - * Racy, but harmless, kick thread if there is still pending data. + * Kick thread to flush if there's still pending data + * or to wakeup the write queue. */ - if (hp->n_outbuf) - hvc_kick(); + hvc_kick(); return written; } From f63aaf6e71de897954fbde4e4a17a9dcdbe5e7e1 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 13 Aug 2025 10:20:22 +0200 Subject: [PATCH 023/450] clk: renesas: mstp: Add genpd OF provider at postcore_initcall() Genpd OF providers must now be registered after genpd bus registration. However, cpg_mstp_add_clk_domain() is only called from CLK_OF_DECLARE(), which is too early. Hence on R-Car M1A, R-Car H1, and RZ/A1, the CPG/MSTP Clock Domain fails to register, and any devices residing in that clock domain fail to probe. Fix this by splitting initialization into two steps: - The first part keeps on registering the PM domain with genpd at CLK_OF_DECLARE(), - The second and new part moves the registration of the genpd OF provider to a postcore_initcall(). See also commit c5ae5a0c61120d0c ("pmdomain: renesas: rcar-sysc: Add genpd OF provider at postcore_initcall"). Fixes: 18a3a510ecfd0e50 ("pmdomain: core: Add the genpd->dev to the genpd provider bus") Signed-off-by: Geert Uytterhoeven Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/81ef5f8d5d31374b7852b05453c52d2f735062a2.1755073087.git.geert+renesas@glider.be --- drivers/clk/renesas/clk-mstp.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/clk/renesas/clk-mstp.c b/drivers/clk/renesas/clk-mstp.c index 5bc473c2adb3..2f65fe2c6bdf 100644 --- a/drivers/clk/renesas/clk-mstp.c +++ b/drivers/clk/renesas/clk-mstp.c @@ -303,6 +303,9 @@ void cpg_mstp_detach_dev(struct generic_pm_domain *unused, struct device *dev) pm_clk_destroy(dev); } +static struct device_node *cpg_mstp_pd_np __initdata = NULL; +static struct generic_pm_domain *cpg_mstp_pd_genpd __initdata = NULL; + void __init cpg_mstp_add_clk_domain(struct device_node *np) { struct generic_pm_domain *pd; @@ -324,5 +327,20 @@ void __init cpg_mstp_add_clk_domain(struct device_node *np) pd->detach_dev = cpg_mstp_detach_dev; pm_genpd_init(pd, &pm_domain_always_on_gov, false); - of_genpd_add_provider_simple(np, pd); + cpg_mstp_pd_np = of_node_get(np); + cpg_mstp_pd_genpd = pd; } + +static int __init cpg_mstp_pd_init_provider(void) +{ + int error; + + if (!cpg_mstp_pd_np) + return -ENODEV; + + error = of_genpd_add_provider_simple(cpg_mstp_pd_np, cpg_mstp_pd_genpd); + + of_node_put(cpg_mstp_pd_np); + return error; +} +postcore_initcall(cpg_mstp_pd_init_provider); From d8f3ae7b38fea546e64a6cfcdc7d061c85f086e2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 13 Aug 2025 15:04:24 +0200 Subject: [PATCH 024/450] pmdomain: renesas: rcar-sysc: Make rcar_sysc_onecell_np __initdata rcar_sysc_onecell_np() is only used by functions marked __init, so it can be freed when init memory is freed. Fixes: c5ae5a0c6112 ("pmdomain: renesas: rcar-sysc: Add genpd OF provider at postcore_initcall") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/e20a848ff952924f8f58c335f9a0242cb2565921.1755090234.git.geert+renesas@glider.be Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/rcar-sysc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pmdomain/renesas/rcar-sysc.c b/drivers/pmdomain/renesas/rcar-sysc.c index 4b310c1d35fa..2d4161170c63 100644 --- a/drivers/pmdomain/renesas/rcar-sysc.c +++ b/drivers/pmdomain/renesas/rcar-sysc.c @@ -342,7 +342,7 @@ struct rcar_pm_domains { }; static struct genpd_onecell_data *rcar_sysc_onecell_data; -static struct device_node *rcar_sysc_onecell_np; +static struct device_node *rcar_sysc_onecell_np __initdata = NULL; static int __init rcar_sysc_pd_init(void) { From d072148a8631f102de60ed5a3a827e85d09d24f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Aug 2025 10:25:00 +0200 Subject: [PATCH 025/450] fs: add a FMODE_ flag to indicate IOCB_HAS_METADATA availability Currently the kernel will happily route io_uring requests with metadata to file operations that don't support it. Add a FMODE_ flag to guard that. Fixes: 4de2ce04c862 ("fs: introduce IOCB_HAS_METADATA for metadata") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250819082517.2038819-2-hch@lst.de Signed-off-by: Christian Brauner --- block/fops.c | 3 +++ include/linux/fs.h | 3 ++- io_uring/rw.c | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/block/fops.c b/block/fops.c index 82451ac8ff25..08e7c21bd9f1 100644 --- a/block/fops.c +++ b/block/fops.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -687,6 +688,8 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + if (blk_get_integrity(bdev->bd_disk)) + filp->f_mode |= FMODE_HAS_METADATA; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..601d036a6c78 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -149,7 +149,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) -/* FMODE_* bit 13 */ +/* Supports IOCB_HAS_METADATA */ +#define FMODE_HAS_METADATA ((__force fmode_t)(1 << 13)) /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) diff --git a/io_uring/rw.c b/io_uring/rw.c index 52a5b950b2e5..af5a54b5db12 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -886,6 +886,9 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data; + if (!(file->f_mode & FMODE_HAS_METADATA)) + return -EINVAL; + /* * We have a union of meta fields with wpq used for buffered-io * in io_async_rw, so fail it here. From 2729a60bbfb9215997f25372ebe9b7964f038296 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Aug 2025 10:25:01 +0200 Subject: [PATCH 026/450] block: don't silently ignore metadata for sync read/write The block fops don't try to handle metadata for synchronous requests, probably because the completion handler looks at dio->iocb which is not valid for synchronous requests. But silently ignoring metadata (or warning in case of __blkdev_direct_IO_simple) is a really bad idea as that can cause silent data corruption if a user ever shows up. Instead simply handle metadata for synchronous requests as the completion handler can simply check for bio_integrity() as the block layer default integrity will already be freed at this point, and thus bio_integrity() will only return true for user mapped integrity. Fixes: 3d8b5a22d404 ("block: add support to pass user meta buffer") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250819082517.2038819-3-hch@lst.de Reviewed-by: Martin K. Petersen Signed-off-by: Christian Brauner --- block/fops.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/fops.c b/block/fops.c index 08e7c21bd9f1..ddbc69c0922b 100644 --- a/block/fops.c +++ b/block/fops.c @@ -55,7 +55,6 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct bio bio; ssize_t ret; - WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -132,7 +131,7 @@ static void blkdev_bio_end_io(struct bio *bio) if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; - if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) + if (bio_integrity(bio)) bio_integrity_unmap_user(bio); if (atomic_dec_and_test(&dio->ref)) { @@ -234,7 +233,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } bio->bi_opf |= REQ_NOWAIT; } - if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { + if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); if (unlikely(ret)) goto fail; @@ -302,7 +301,7 @@ static void blkdev_bio_end_io_async(struct bio *bio) ret = blk_status_to_errno(bio->bi_status); } - if (iocb->ki_flags & IOCB_HAS_METADATA) + if (bio_integrity(bio)) bio_integrity_unmap_user(bio); iocb->ki_complete(iocb, ret); @@ -423,7 +422,8 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (likely(nr_pages <= BIO_MAX_VECS)) { + if (likely(nr_pages <= BIO_MAX_VECS && + !(iocb->ki_flags & IOCB_HAS_METADATA))) { if (is_sync_kiocb(iocb)) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); From f41c538881eec4dcf5961a242097d447f848cda6 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Tue, 29 Jul 2025 23:03:12 +0800 Subject: [PATCH 027/450] dmaengine: idxd: Remove improper idxd_free The call to idxd_free() introduces a duplicate put_device() leading to a reference count underflow: refcount_t: underflow; use-after-free. WARNING: CPU: 15 PID: 4428 at lib/refcount.c:28 refcount_warn_saturate+0xbe/0x110 ... Call Trace: idxd_remove+0xe4/0x120 [idxd] pci_device_remove+0x3f/0xb0 device_release_driver_internal+0x197/0x200 driver_detach+0x48/0x90 bus_remove_driver+0x74/0xf0 pci_unregister_driver+0x2e/0xb0 idxd_exit_module+0x34/0x7a0 [idxd] __do_sys_delete_module.constprop.0+0x183/0x280 do_syscall_64+0x54/0xd70 entry_SYSCALL_64_after_hwframe+0x76/0x7e The idxd_unregister_devices() which is invoked at the very beginning of idxd_remove(), already takes care of the necessary put_device() through the following call path: idxd_unregister_devices() -> device_unregister() -> put_device() In addition, when CONFIG_DEBUG_KOBJECT_RELEASE is enabled, put_device() may trigger asynchronous cleanup via schedule_delayed_work(). If idxd_free() is called immediately after, it can result in a use-after-free. Remove the improper idxd_free() to avoid both the refcount underflow and potential memory corruption during module unload. Fixes: d5449ff1b04d ("dmaengine: idxd: Add missing idxd cleanup to fix memory leak in remove call") Signed-off-by: Yi Sun Tested-by: Shuai Xue Reviewed-by: Dave Jiang Acked-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/20250729150313.1934101-2-yi.sun@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 35bdefd3728b..487268fc527b 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -1294,7 +1294,6 @@ static void idxd_remove(struct pci_dev *pdev) idxd_cleanup(idxd); pci_iounmap(pdev, idxd->reg_base); put_device(idxd_confdev(idxd)); - idxd_free(idxd); pci_disable_device(pdev); } From b7cb9a034305d52222433fad10c3de10204f29e7 Mon Sep 17 00:00:00 2001 From: Yi Sun Date: Tue, 29 Jul 2025 23:03:13 +0800 Subject: [PATCH 028/450] dmaengine: idxd: Fix refcount underflow on module unload A recent refactor introduced a misplaced put_device() call, resulting in a reference count underflow during module unload. There is no need to add additional put_device() calls for idxd groups, engines, or workqueues. Although the commit claims: "Note, this also fixes the missing put_device() for idxd groups, engines, and wqs." It appears no such omission actually existed. The required cleanup is already handled by the call chain: idxd_unregister_devices() -> device_unregister() -> put_device() Extend idxd_cleanup() to handle the remaining necessary cleanup and remove idxd_cleanup_internals(), which duplicates deallocation logic for idxd, engines, groups, and workqueues. Memory management is also properly handled through the Linux device model. Fixes: a409e919ca32 ("dmaengine: idxd: Refactor remove call with idxd_cleanup() helper") Signed-off-by: Yi Sun Tested-by: Shuai Xue Reviewed-by: Dave Jiang Acked-by: Vinicius Costa Gomes Link: https://lore.kernel.org/r/20250729150313.1934101-3-yi.sun@intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 487268fc527b..efe614a528fa 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -1291,7 +1291,10 @@ static void idxd_remove(struct pci_dev *pdev) device_unregister(idxd_confdev(idxd)); idxd_shutdown(pdev); idxd_device_remove_debugfs(idxd); - idxd_cleanup(idxd); + perfmon_pmu_remove(idxd); + idxd_cleanup_interrupts(idxd); + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); pci_iounmap(pdev, idxd->reg_base); put_device(idxd_confdev(idxd)); pci_disable_device(pdev); From 39aaa337449e71a41d4813be0226a722827ba606 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 11 Aug 2025 13:43:39 +0300 Subject: [PATCH 029/450] dmaengine: idxd: Fix double free in idxd_setup_wqs() The clean up in idxd_setup_wqs() has had a couple bugs because the error handling is a bit subtle. It's simpler to just re-write it in a cleaner way. The issues here are: 1) If "idxd->max_wqs" is <= 0 then we call put_device(conf_dev) when "conf_dev" hasn't been initialized. 2) If kzalloc_node() fails then again "conf_dev" is invalid. It's either uninitialized or it points to the "conf_dev" from the previous iteration so it leads to a double free. It's better to free partial loop iterations within the loop and then the unwinding at the end can handle whole loop iterations. I also renamed the labels to describe what the goto does and not where the goto was located. Fixes: 3fd2f4bc010c ("dmaengine: idxd: fix memory leak in error handling path of idxd_setup_wqs") Reported-by: Colin Ian King Closes: https://lore.kernel.org/all/20250811095836.1642093-1-colin.i.king@gmail.com/ Signed-off-by: Dan Carpenter Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/aJnJW3iYTDDCj9sk@stanley.mountain Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index efe614a528fa..8c4725ad1f64 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -189,27 +189,30 @@ static int idxd_setup_wqs(struct idxd_device *idxd) idxd->wq_enable_map = bitmap_zalloc_node(idxd->max_wqs, GFP_KERNEL, dev_to_node(dev)); if (!idxd->wq_enable_map) { rc = -ENOMEM; - goto err_bitmap; + goto err_free_wqs; } for (i = 0; i < idxd->max_wqs; i++) { wq = kzalloc_node(sizeof(*wq), GFP_KERNEL, dev_to_node(dev)); if (!wq) { rc = -ENOMEM; - goto err; + goto err_unwind; } idxd_dev_set_type(&wq->idxd_dev, IDXD_DEV_WQ); conf_dev = wq_confdev(wq); wq->id = i; wq->idxd = idxd; - device_initialize(wq_confdev(wq)); + device_initialize(conf_dev); conf_dev->parent = idxd_confdev(idxd); conf_dev->bus = &dsa_bus_type; conf_dev->type = &idxd_wq_device_type; rc = dev_set_name(conf_dev, "wq%d.%d", idxd->id, wq->id); - if (rc < 0) - goto err; + if (rc < 0) { + put_device(conf_dev); + kfree(wq); + goto err_unwind; + } mutex_init(&wq->wq_lock); init_waitqueue_head(&wq->err_queue); @@ -220,15 +223,20 @@ static int idxd_setup_wqs(struct idxd_device *idxd) wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); if (!wq->wqcfg) { + put_device(conf_dev); + kfree(wq); rc = -ENOMEM; - goto err; + goto err_unwind; } if (idxd->hw.wq_cap.op_config) { wq->opcap_bmap = bitmap_zalloc(IDXD_MAX_OPCAP_BITS, GFP_KERNEL); if (!wq->opcap_bmap) { + kfree(wq->wqcfg); + put_device(conf_dev); + kfree(wq); rc = -ENOMEM; - goto err_opcap_bmap; + goto err_unwind; } bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS); } @@ -239,13 +247,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) return 0; -err_opcap_bmap: - kfree(wq->wqcfg); - -err: - put_device(conf_dev); - kfree(wq); - +err_unwind: while (--i >= 0) { wq = idxd->wqs[i]; if (idxd->hw.wq_cap.op_config) @@ -254,11 +256,10 @@ static int idxd_setup_wqs(struct idxd_device *idxd) conf_dev = wq_confdev(wq); put_device(conf_dev); kfree(wq); - } bitmap_free(idxd->wq_enable_map); -err_bitmap: +err_free_wqs: kfree(idxd->wqs); return rc; From 41a86f62424ac436cb51e3de612ef1e1ddb0c873 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Wed, 20 Aug 2025 21:34:24 +0800 Subject: [PATCH 030/450] fs: fix indentation style Replace 8 leading spaces with a tab to follow kernel coding style. Signed-off-by: Guopeng Zhang Link: https://lore.kernel.org/20250820133424.1667467-1-zhangguopeng@kylinos.cn Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index ae6d1312b184..51f77c65c0c6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2455,7 +2455,7 @@ struct vfsmount *clone_private_mount(const struct path *path) return ERR_PTR(-EINVAL); } - if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) + if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (__has_locked_children(old_mnt, path->dentry)) From 16c07342b5425b86547146a6e51d9e32cee8d300 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 20 Aug 2025 18:33:11 -0500 Subject: [PATCH 031/450] gpiolib: acpi: Program debounce when finding GPIO When soc-button-array looks up the GPIO to use it calls acpi_find_gpio() which will parse _CRS. acpi_find_gpio.cold (drivers/gpio/gpiolib-acpi-core.c:953) gpiod_find_and_request (drivers/gpio/gpiolib.c:4598 drivers/gpio/gpiolib.c:4625) gpiod_get_index (drivers/gpio/gpiolib.c:4877) The GPIO is setup basically, but the debounce information is discarded. The platform will assert what debounce should be in _CRS, so program it at the time it's available. Reviewed-by: Mika Westerberg Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Andy Shevchenko --- drivers/gpio/gpiolib-acpi-core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpio/gpiolib-acpi-core.c b/drivers/gpio/gpiolib-acpi-core.c index 12b24a717e43..e81a29902bb2 100644 --- a/drivers/gpio/gpiolib-acpi-core.c +++ b/drivers/gpio/gpiolib-acpi-core.c @@ -944,6 +944,7 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, bool can_fallback = acpi_can_fallback_to_crs(adev, con_id); struct acpi_gpio_info info; struct gpio_desc *desc; + int ret; desc = __acpi_find_gpio(fwnode, con_id, idx, can_fallback, &info); if (IS_ERR(desc)) @@ -957,6 +958,12 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, acpi_gpio_update_gpiod_flags(dflags, &info); acpi_gpio_update_gpiod_lookup_flags(lookupflags, &info); + + /* ACPI uses hundredths of milliseconds units */ + ret = gpio_set_debounce_timeout(desc, info.debounce * 10); + if (ret) + return ERR_PTR(ret); + return desc; } From be1e0283021ec73c2eb92839db9a471a068709d9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 21 Aug 2025 13:50:47 +0200 Subject: [PATCH 032/450] coredump: don't pointlessly check and spew warnings When a write happens it doesn't make sense to check perform checks on the input. Skip them. Whether a fixes tag is licensed is a bit of a gray area here but I'll add one for the socket validation part I added recently. Link: https://lore.kernel.org/20250821-moosbedeckt-denunziant-7908663f3563@brauner Fixes: 16195d2c7dd2 ("coredump: validate socket name as it is written") Reported-by: Brad Spengler Signed-off-by: Christian Brauner --- fs/coredump.c | 4 ++++ fs/exec.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/coredump.c b/fs/coredump.c index 5dce257c67fc..60bc9685e149 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1466,11 +1466,15 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, ssize_t retval; char old_core_pattern[CORENAME_MAX_SIZE]; + if (write) + return proc_dostring(table, write, buffer, lenp, ppos); + retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE); error = proc_dostring(table, write, buffer, lenp, ppos); if (error) return error; + if (!check_coredump_socket()) { strscpy(core_pattern, old_core_pattern, retval + 1); return -EINVAL; diff --git a/fs/exec.c b/fs/exec.c index 2a1e5e4042a1..e861a4b7ffda 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) + if (!error && !write) validate_coredump_safety(); return error; } From 220abf77e7c2835cc63ea8cd7158cf83952640af Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 21 Aug 2025 09:56:38 +0530 Subject: [PATCH 033/450] cpufreq/amd-pstate: Fix setting of CPPC.min_perf in active mode for performance governor In the "active" mode of the amd-pstate driver with performance governor, the CPPC.min_perf is expected to be the nominal_perf. However after commit a9b9b4c2a4cd ("cpufreq/amd-pstate: Drop min and max cached frequencies"), this is not the case when the governor is switched from performance to powersave and back to performance, and the CPPC.min_perf will be equal to the scaling_min_freq that was set for the powersave governor. This is because prior to commit a9b9b4c2a4cd ("cpufreq/amd-pstate: Drop min and max cached frequencies"), amd_pstate_epp_update_limit() would unconditionally call amd_pstate_update_min_max_limit() and the latter function would enforce the CPPC.min_perf constraint when the governor is performance. However, after the aforementioned commit, amd_pstate_update_min_max_limit() is called by amd_pstate_epp_update_limit() only when either the scaling_{min/max}_freq is different from the cached value of cpudata->{min/max}_limit_freq, which wouldn't have changed on a governor transition from powersave to performance, thus missing out on enforcing the CPPC.min_perf constraint for the performance governor. Fix this by invoking amd_pstate_epp_udpate_limit() not only when the {min/max} limits have changed from the cached values, but also when the policy itself has changed. Fixes: a9b9b4c2a4cd ("cpufreq/amd-pstate: Drop min and max cached frequencies") Signed-off-by: Gautham R. Shenoy Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250821042638.356-1-gautham.shenoy@amd.com Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index bbc27ef9edf7..5cd91489fcbe 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1554,13 +1554,15 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) pr_debug("CPU %d exiting\n", policy->cpu); } -static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) +static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy, bool policy_change) { struct amd_cpudata *cpudata = policy->driver_data; union perf_cached perf; u8 epp; - if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) + if (policy_change || + policy->min != cpudata->min_limit_freq || + policy->max != cpudata->max_limit_freq) amd_pstate_update_min_max_limit(policy); if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) @@ -1584,7 +1586,7 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) cpudata->policy = policy->policy; - ret = amd_pstate_epp_update_limit(policy); + ret = amd_pstate_epp_update_limit(policy, true); if (ret) return ret; @@ -1658,7 +1660,7 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) int ret; /* enable amd pstate from suspend state*/ - ret = amd_pstate_epp_update_limit(policy); + ret = amd_pstate_epp_update_limit(policy, false); if (ret) return ret; From 79f919a89c9d06816dbdbbd168fa41d27411a7f9 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 19 Aug 2025 01:07:24 +0000 Subject: [PATCH 034/450] cgroup: split cgroup_destroy_wq into 3 workqueues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A hung task can occur during [1] LTP cgroup testing when repeatedly mounting/unmounting perf_event and net_prio controllers with systemd.unified_cgroup_hierarchy=1. The hang manifests in cgroup_lock_and_drain_offline() during root destruction. Related case: cgroup_fj_function_perf_event cgroup_fj_function.sh perf_event cgroup_fj_function_net_prio cgroup_fj_function.sh net_prio Call Trace: cgroup_lock_and_drain_offline+0x14c/0x1e8 cgroup_destroy_root+0x3c/0x2c0 css_free_rwork_fn+0x248/0x338 process_one_work+0x16c/0x3b8 worker_thread+0x22c/0x3b0 kthread+0xec/0x100 ret_from_fork+0x10/0x20 Root Cause: CPU0 CPU1 mount perf_event umount net_prio cgroup1_get_tree cgroup_kill_sb rebind_subsystems // root destruction enqueues // cgroup_destroy_wq // kill all perf_event css // one perf_event css A is dying // css A offline enqueues cgroup_destroy_wq // root destruction will be executed first css_free_rwork_fn cgroup_destroy_root cgroup_lock_and_drain_offline // some perf descendants are dying // cgroup_destroy_wq max_active = 1 // waiting for css A to die Problem scenario: 1. CPU0 mounts perf_event (rebind_subsystems) 2. CPU1 unmounts net_prio (cgroup_kill_sb), queuing root destruction work 3. A dying perf_event CSS gets queued for offline after root destruction 4. Root destruction waits for offline completion, but offline work is blocked behind root destruction in cgroup_destroy_wq (max_active=1) Solution: Split cgroup_destroy_wq into three dedicated workqueues: cgroup_offline_wq – Handles CSS offline operations cgroup_release_wq – Manages resource release cgroup_free_wq – Performs final memory deallocation This separation eliminates blocking in the CSS free path while waiting for offline operations to complete. [1] https://github.com/linux-test-project/ltp/blob/master/runtest/controllers Fixes: 334c3679ec4b ("cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends") Reported-by: Gao Yingjie Signed-off-by: Chen Ridong Suggested-by: Teju Heo Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 43 +++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..79b1d79f86a3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -126,8 +126,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. + * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: use for css offline work + * cgroup_release_wq: use for css release work + * cgroup_free_wq: use for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. */ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -5558,7 +5581,7 @@ static void css_release_work_fn(struct work_struct *work) cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5567,7 +5590,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5701,7 +5724,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, list_del_rcu(&css->sibling); err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5939,7 +5962,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6325,8 +6348,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); From 94a4acfec14615e971eb2c9e1fa6c992c85ff6c6 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 22 Aug 2025 07:07:15 +0000 Subject: [PATCH 035/450] cgroup/psi: Set of->priv to NULL upon file release Setting of->priv to NULL when the file is released enables earlier bug detection. This allows potential bugs to manifest as NULL pointer dereferences rather than use-after-free errors[1], which are generally more difficult to diagnose. [1] https://lore.kernel.org/cgroups/38ef3ff9-b380-44f0-9315-8b3714b0948d@huaweicloud.com/T/#m8a3b3f88f0ff3da5925d342e90043394f8b2091b Signed-off-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 79b1d79f86a3..77d02f87f3f1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4182,6 +4182,7 @@ static void cgroup_file_release(struct kernfs_open_file *of) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); + of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, From cba70aff623b104085ab5613fedd21f6ea19095a Mon Sep 17 00:00:00 2001 From: Fabio Porcedda Date: Wed, 6 Aug 2025 14:09:26 +0200 Subject: [PATCH 036/450] USB: serial: option: add Telit Cinterion FN990A w/audio compositions Add the following Telit Cinterion FN990A w/audio compositions: 0x1077: tty (diag) + adb + rmnet + audio + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=09 Cnt=01 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1077 Rev=05.04 S: Manufacturer=Telit Wireless Solutions S: Product=FN990 S: SerialNumber=67e04c35 C: #Ifs=10 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 0 Cls=01(audio) Sub=01 Prot=20 Driver=snd-usb-audio I: If#= 4 Alt= 1 #EPs= 1 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio E: Ad=03(O) Atr=0d(Isoc) MxPS= 68 Ivl=1ms I: If#= 5 Alt= 1 #EPs= 1 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio E: Ad=84(I) Atr=0d(Isoc) MxPS= 68 Ivl=1ms I: If#= 6 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 7 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 8 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 9 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8c(I) Atr=03(Int.) MxPS= 10 Ivl=32ms 0x1078: tty (diag) + adb + MBIM + audio + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=09 Cnt=01 Dev#= 21 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1078 Rev=05.04 S: Manufacturer=Telit Wireless Solutions S: Product=FN990 S: SerialNumber=67e04c35 C: #Ifs=11 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#=10 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8c(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 3 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 0 Cls=01(audio) Sub=01 Prot=20 Driver=snd-usb-audio I: If#= 5 Alt= 0 #EPs= 0 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio I: If#= 6 Alt= 1 #EPs= 1 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio E: Ad=84(I) Atr=0d(Isoc) MxPS= 68 Ivl=1ms I: If#= 7 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 8 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 9 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms 0x1079: RNDIS + tty (diag) + adb + audio + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=09 Cnt=01 Dev#= 23 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1079 Rev=05.04 S: Manufacturer=Telit Wireless Solutions S: Product=FN990 S: SerialNumber=67e04c35 C: #Ifs=11 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=ef(misc ) Sub=04 Prot=01 Driver=rndis_host E: Ad=81(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#=10 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8c(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=(none) E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 0 Cls=01(audio) Sub=01 Prot=20 Driver=snd-usb-audio I: If#= 5 Alt= 0 #EPs= 0 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio I: If#= 6 Alt= 1 #EPs= 1 Cls=01(audio) Sub=02 Prot=20 Driver=snd-usb-audio E: Ad=84(I) Atr=0d(Isoc) MxPS= 68 Ivl=1ms I: If#= 7 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 8 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 9 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms Cc: stable@vger.kernel.org Signed-off-by: Fabio Porcedda Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index e5cd33093423..1f4da8120111 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1369,6 +1369,12 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1075, 0xff), /* Telit FN990A (PCIe) */ .driver_info = RSVD(0) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1077, 0xff), /* Telit FN990A (rmnet + audio) */ + .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1078, 0xff), /* Telit FN990A (MBIM + audio) */ + .driver_info = NCTRL(0) | RSVD(1) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1079, 0xff), /* Telit FN990A (RNDIS + audio) */ + .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1080, 0xff), /* Telit FE990A (rmnet) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1081, 0xff), /* Telit FE990A (MBIM) */ From 57834ce5a6a47df282c8419019ba5495eac58fb9 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 21 Aug 2025 19:00:03 +0200 Subject: [PATCH 037/450] s390/mm: Prevent possible preempt_count overflow The s390 implementation of ptep_modify_prot_start() currently does preempt_disable(), and the preempt_enable() is done later in ptep_modify_prot_commit(). This logic is not really required, because the PTE lock must be held over the complete prot_start/commit transaction, as described in the comment of the generic implementation of ptep_modify_prot_start(). That comment also mentions that this interface should be batchable, and modify_prot_start_ptes() might start a transaction over a batch of PTEs, implemented as a simple loop over ptep_modify_prot_start(). In this case, the preempt_disable() in ptep_modify_prot_start() would be called multiple times, before the corresponding preempt_enable() calls happen, and this can lead to a preempt_count overflow. To fix this, simply remove the preempt_disable/enable() calls in ptep_modify_prot_start/commit(), and rely on the PTE lock being held. Commit cac1db8c3aad ("mm: optimize mprotect() by PTE batching") made use of this PTE batching for the first time, and triggers warnings like this: DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10) BUG: sleeping function called from invalid context at mm/mprotect.c:576 Hence, add a Fixes tag on that commit. Not because it is broken, but to make sure that it won't get backported w/o also this fix for s390. Fixes: cac1db8c3aad ("mm: optimize mprotect() by PTE batching") Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Alexander Gordeev --- arch/s390/mm/pgtable.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 60688be4e876..50eb57c976bc 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -335,7 +335,6 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, int nodat; struct mm_struct *mm = vma->vm_mm; - preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); old = ptep_flush_lazy(mm, addr, ptep, nodat); @@ -360,7 +359,6 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, } else { set_pte(ptep, pte); } - preempt_enable(); } static inline void pmdp_idte_local(struct mm_struct *mm, From a5a261bea9bf8444300d1067b4a73bedee5b5227 Mon Sep 17 00:00:00 2001 From: Fabio Porcedda Date: Fri, 22 Aug 2025 11:08:39 +0200 Subject: [PATCH 038/450] USB: serial: option: add Telit Cinterion LE910C4-WWX new compositions Add the following Telit Cinterion LE910C4-WWX new compositions: 0x1034: tty (AT) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1034 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x1036: tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 10 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1036 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x1037: tty (diag) + tty (Telit custom) + tty (AT) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 15 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1037 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x1038: tty (Telit custom) + tty (AT) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 9 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1038 Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x103b: tty (diag) + tty (Telit custom) + tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 10 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=103b Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 4 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x103c: tty (Telit custom) + tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 11 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=103c Rev=00.00 S: Manufacturer=Telit S: Product=LE910C4-WWX S: SerialNumber=93f617e7 C: #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=fe Prot=ff Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Cc: stable@vger.kernel.org Signed-off-by: Fabio Porcedda Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 1f4da8120111..fc869b7f803f 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1322,7 +1322,18 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1033, 0xff), /* Telit LE910C1-EUX (ECM) */ .driver_info = NCTRL(0) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1034, 0xff), /* Telit LE910C4-WWX (rmnet) */ + .driver_info = RSVD(2) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1035, 0xff) }, /* Telit LE910C4-WWX (ECM) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1036, 0xff) }, /* Telit LE910C4-WWX */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1037, 0xff), /* Telit LE910C4-WWX (rmnet) */ + .driver_info = NCTRL(0) | NCTRL(1) | RSVD(4) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1038, 0xff), /* Telit LE910C4-WWX (rmnet) */ + .driver_info = NCTRL(0) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x103b, 0xff), /* Telit LE910C4-WWX */ + .driver_info = NCTRL(0) | NCTRL(1) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x103c, 0xff), /* Telit LE910C4-WWX */ + .driver_info = NCTRL(0) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG0), .driver_info = RSVD(0) | RSVD(1) | NCTRL(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG1), From e9c8da670e749f7dedc53e3af54a87b041918092 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 10 Jul 2025 12:08:30 +0200 Subject: [PATCH 039/450] fuse: do not allow mapping a non-regular backing file We do not support passthrough operations other than read/write on regular file, so allowing non-regular backing files makes no sense. Fixes: efad7153bf93 ("fuse: allow O_PATH fd for FUSE_DEV_IOC_BACKING_OPEN") Cc: stable@vger.kernel.org Signed-off-by: Amir Goldstein Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/passthrough.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 607ef735ad4a..eb97ac009e75 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -237,6 +237,11 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) if (!file) goto out; + /* read/write/splice/mmap passthrough only relevant for regular files */ + res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL; + if (!d_is_reg(file->f_path.dentry)) + goto out_fput; + backing_sb = file_inode(file)->i_sb; res = -ELOOP; if (backing_sb->s_stack_depth >= fc->max_stack_depth) From e5203209b3935041dac541bc5b37efb44220cc0b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 12 Aug 2025 14:07:54 +0200 Subject: [PATCH 040/450] fuse: check if copy_file_range() returns larger than requested size Just like write(), copy_file_range() should check if the return value is less or equal to the requested number of bytes. Reported-by: Chunsheng Luo Closes: https://lore.kernel.org/all/20250807062425.694-1-luochunsheng@ustc.edu/ Fixes: 88bc7d5097a1 ("fuse: add support for copy_file_range()") Cc: # v4.20 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5525a4520b0f..45207a6bb85f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3026,6 +3026,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, fc->no_copy_file_range = 1; err = -EOPNOTSUPP; } + if (!err && outarg.size > len) + err = -EIO; + if (err) goto out; From 1e08938c3694f707bb165535df352ac97a8c75c9 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 12 Aug 2025 14:46:34 +0200 Subject: [PATCH 041/450] fuse: prevent overflow in copy_file_range return value The FUSE protocol uses struct fuse_write_out to convey the return value of copy_file_range, which is restricted to uint32_t. But the COPY_FILE_RANGE interface supports a 64-bit size copies. Currently the number of bytes copied is silently truncated to 32-bit, which may result in poor performance or even failure to copy in case of truncation to zero. Reported-by: Florian Weimer Closes: https://lore.kernel.org/all/lhuh5ynl8z5.fsf@oldenburg.str.redhat.com/ Fixes: 88bc7d5097a1 ("fuse: add support for copy_file_range()") Cc: # v4.20 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 45207a6bb85f..4adcf09d4b01 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2960,7 +2960,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = len, + .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), .flags = flags }; struct fuse_write_out outarg; From 79569946502258ef53984f3000bffa77e469d8dc Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 15 Aug 2025 11:25:38 -0700 Subject: [PATCH 042/450] fuse: reflect cached blocksize if blocksize was changed As pointed out by Miklos[1], in the fuse_update_get_attr() path, the attributes returned to stat may be cached values instead of fresh ones fetched from the server. In the case where the server returned a modified blocksize value, we need to cache it and reflect it back to stat if values are not re-fetched since we now no longer directly change inode->i_blkbits. Link: https://lore.kernel.org/linux-fsdevel/CAJfpeguCOxeVX88_zPd1hqziB_C+tmfuDhZP5qO2nKmnb-dTUA@mail.gmail.com/ [1] Fixes: 542ede096e48 ("fuse: keep inode->i_blkbits constant") Signed-off-by: Joanne Koong Reviewed-by: Darrick J. Wong Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 1 + fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/inode.c | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2d817d7cab26..ebee7e0b1cd3 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1377,6 +1377,7 @@ static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode, generic_fillattr(idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; + stat->blksize = 1 << fi->cached_i_blkbits; if (test_bit(FUSE_I_BTIME, &fi->state)) { stat->btime = fi->i_btime; stat->result_mask |= STATX_BTIME; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ec248d13c8bf..1647eb7ca6fa 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -210,6 +210,12 @@ struct fuse_inode { /** Reference to backing file in passthrough mode */ struct fuse_backing *fb; #endif + + /* + * The underlying inode->i_blkbits value will not be modified, + * so preserve the blocksize specified by the server. + */ + u8 cached_i_blkbits; }; /** FUSE inode state bits */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 67c2318bfc42..3bfd83469d9f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -289,6 +289,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, } } + if (attr->blksize) + fi->cached_i_blkbits = ilog2(attr->blksize); + else + fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits; + /* * Don't set the sticky bit in i_mode, unless we want the VFS * to check permissions. This prevents failures due to the From bd24d2108e9c8459d2c9f3d6d910b0053887df57 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 15 Aug 2025 11:25:39 -0700 Subject: [PATCH 043/450] fuse: fix fuseblk i_blkbits for iomap partial writes On regular fuse filesystems, i_blkbits is set to PAGE_SHIFT which means any iomap partial writes will mark the entire folio as uptodate. However fuseblk filesystems work differently and allow the blocksize to be less than the page size. As such, this may lead to data corruption if fuseblk sets its blocksize to less than the page size, uses the writeback cache, and does a partial write, then a read and the read happens before the write has undergone writeback, since the folio will not be marked uptodate from the partial write so the read will read in the entire folio from disk, which will overwrite the partial write. The long-term solution for this, which will also be needed for fuse to enable large folios with the writeback cache on, is to have fuse also use iomap for folio reads, but until that is done, the cleanest workaround is to use the page size for fuseblk's internal kernel inode blksize/blkbits values while maintaining current behavior for stat(). This was verified using ntfs-3g: $ sudo mkfs.ntfs -f -c 512 /dev/vdd1 $ sudo ntfs-3g /dev/vdd1 ~/fuseblk $ stat ~/fuseblk/hi.txt IO Block: 512 Fixes: a4c9ab1d4975 ("fuse: use iomap for buffered writes") Signed-off-by: Joanne Koong Reviewed-by: Darrick J. Wong Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- fs/fuse/fuse_i.h | 8 ++++++++ fs/fuse/inode.c | 13 ++++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ebee7e0b1cd3..5c569c3cb53f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1199,7 +1199,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, if (attr->blksize != 0) blkbits = ilog2(attr->blksize); else - blkbits = inode->i_sb->s_blocksize_bits; + blkbits = fc->blkbits; stat->blksize = 1 << blkbits; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1647eb7ca6fa..cc428d04be3e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -975,6 +975,14 @@ struct fuse_conn { /* Request timeout (in jiffies). 0 = no timeout */ unsigned int req_timeout; } timeout; + + /* + * This is a workaround until fuse uses iomap for reads. + * For fuseblk servers, this represents the blocksize passed in at + * mount time and for regular fuse servers, this is equivalent to + * inode->i_blkbits. + */ + u8 blkbits; }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3bfd83469d9f..7ddfd2b3cc9c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -292,7 +292,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, if (attr->blksize) fi->cached_i_blkbits = ilog2(attr->blksize); else - fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits; + fi->cached_i_blkbits = fc->blkbits; /* * Don't set the sticky bit in i_mode, unless we want the VFS @@ -1810,10 +1810,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err = -EINVAL; if (!sb_set_blocksize(sb, ctx->blksize)) goto err; + /* + * This is a workaround until fuse hooks into iomap for reads. + * Use PAGE_SIZE for the blocksize else if the writeback cache + * is enabled, buffered writes go through iomap and a read may + * overwrite partially written data if blocksize < PAGE_SIZE + */ + fc->blkbits = sb->s_blocksize_bits; + if (ctx->blksize != PAGE_SIZE && + !sb_set_blocksize(sb, PAGE_SIZE)) + goto err; #endif } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; + fc->blkbits = sb->s_blocksize_bits; } sb->s_subtype = ctx->subtype; From 9d81ba6d49a7457784f0b6a71046818b86ec7e44 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Wed, 27 Aug 2025 09:45:55 +0800 Subject: [PATCH 044/450] fuse: Block access to folio overlimit syz reported a slab-out-of-bounds Write in fuse_dev_do_write. When the number of bytes to be retrieved is truncated to the upper limit by fc->max_pages and there is an offset, the oob is triggered. Add a loop termination condition to prevent overruns. Fixes: 3568a9569326 ("fuse: support large folios for retrieves") Reported-by: syzbot+2d215d165f9354b9c4ea@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2d215d165f9354b9c4ea Tested-by: syzbot+2d215d165f9354b9c4ea@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e80cd8f2c049..5150aa25e64b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1893,7 +1893,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num) { + while (num && ap->num_folios < num_pages) { struct folio *folio; unsigned int folio_offset; unsigned int nr_bytes; From 131897c65e2b86cf14bec7379f44aa8fbb407526 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sun, 24 Aug 2025 23:11:57 +0800 Subject: [PATCH 045/450] erofs: fix invalid algorithm for encoded extents The current algorithm sanity checks do not properly apply to new encoded extents. Unify the algorithm check with Z_EROFS_COMPRESSION(_RUNTIME)_MAX and ensure consistency with sbi->available_compr_algs. Reported-and-tested-by: syzbot+5a398eb460ddaa6f242f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/68a8bd20.050a0220.37038e.005a.GAE@google.com Fixes: 1d191b4ca51d ("erofs: implement encoded extent metadata") Thanks-to: Edward Adam Davis Signed-off-by: Gao Xiang --- fs/erofs/zmap.c | 67 +++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index a93efd95c555..798223e6da9c 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -394,10 +394,10 @@ static int z_erofs_map_blocks_fo(struct inode *inode, .map = map, .in_mbox = erofs_inode_in_metabox(inode), }; - int err = 0; - unsigned int endoff, afmt; + unsigned int endoff; unsigned long initial_lcn; unsigned long long ofs, end; + int err; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la; if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) && @@ -482,20 +482,15 @@ static int z_erofs_map_blocks_fo(struct inode *inode, err = -EFSCORRUPTED; goto unmap_out; } - afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ? - Z_EROFS_COMPRESSION_INTERLACED : - Z_EROFS_COMPRESSION_SHIFTED; + if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER) + map->m_algorithmformat = Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { + map->m_algorithmformat = vi->z_algorithmtype[1]; } else { - afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? - vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; - if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { - erofs_err(sb, "inconsistent algorithmtype %u for nid %llu", - afmt, vi->nid); - err = -EFSCORRUPTED; - goto unmap_out; - } + map->m_algorithmformat = vi->z_algorithmtype[0]; } - map->m_algorithmformat = afmt; if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && @@ -626,9 +621,9 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; - int err, headnr; - erofs_off_t pos; struct z_erofs_map_header *h; + erofs_off_t pos; + int err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { /* @@ -642,7 +637,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) return -ERESTARTSYS; - err = 0; if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; @@ -679,15 +673,6 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) vi->z_idata_size = le16_to_cpu(h->h_idata_size); - headnr = 0; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || - vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", - headnr + 1, vi->z_algorithmtype[headnr], vi->nid); - err = -EOPNOTSUPP; - goto out_unlock; - } - if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { @@ -726,6 +711,30 @@ static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map) return err; } +static int z_erofs_map_sanity_check(struct inode *inode, + struct erofs_map_blocks *map) +{ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + + if (!(map->m_flags & EROFS_MAP_ENCODED)) + return 0; + if (unlikely(map->m_algorithmformat >= Z_EROFS_COMPRESSION_RUNTIME_MAX)) { + erofs_err(inode->i_sb, "unknown algorithm %d @ pos %llu for nid %llu, please upgrade kernel", + map->m_algorithmformat, map->m_la, EROFS_I(inode)->nid); + return -EOPNOTSUPP; + } + if (unlikely(map->m_algorithmformat < Z_EROFS_COMPRESSION_MAX && + !(sbi->available_compr_algs & (1 << map->m_algorithmformat)))) { + erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + map->m_algorithmformat, EROFS_I(inode)->nid); + return -EFSCORRUPTED; + } + if (unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || + map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) + return -EOPNOTSUPP; + return 0; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) { @@ -746,10 +755,8 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, else err = z_erofs_map_blocks_fo(inode, map, flags); } - if (!err && (map->m_flags & EROFS_MAP_ENCODED) && - unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || - map->m_llen > Z_EROFS_PCLUSTER_MAX_DSIZE)) - err = -EOPNOTSUPP; + if (!err) + err = z_erofs_map_sanity_check(inode, map); if (err) map->m_llen = 0; } From f544bf03a771ee746b908e9a08ecb97c65a35055 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 31 Jul 2025 10:35:14 +0200 Subject: [PATCH 046/450] mtd: MTD_INTEL_DG should depend on DRM_I915 or DRM_XE Intel Discrete Graphics non-volatile memory is only present on Intel discrete graphics devices, and its auxiliary device is instantiated by the Intel i915 and Xe2 DRM drivers. Hence add dependencies on DRM_I915 and DRM_XE, to prevent asking the user about this driver when configuring a kernel without Intel graphics support. Fixes: ceb5ab3cb6463795 ("mtd: add driver for intel graphics non-volatile memory device") Signed-off-by: Geert Uytterhoeven Signed-off-by: Miquel Raynal --- drivers/mtd/devices/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig index 46cebde79f34..e518dfeee654 100644 --- a/drivers/mtd/devices/Kconfig +++ b/drivers/mtd/devices/Kconfig @@ -185,8 +185,8 @@ config MTD_POWERNV_FLASH config MTD_INTEL_DG tristate "Intel Discrete Graphics non-volatile memory driver" - depends on AUXILIARY_BUS - depends on MTD + depends on AUXILIARY_BUS && MTD + depends on DRM_I915!=n || DRM_XE!=n || COMPILE_TEST help This provides an MTD device to access Intel Discrete Graphics non-volatile memory. From 1eae113dd5ff5192cfd3e11b6ab7b96193b42c01 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 30 Jul 2025 21:47:46 +0200 Subject: [PATCH 047/450] mtd: rawnand: nuvoton: Fix an error handling path in ma35_nand_chips_init() If a ma35_nand_chip_init() call fails, then a reference to 'nand_np' still needs to be released. Use for_each_child_of_node_scoped() to fix the issue. Fixes: 5abb5d414d55 ("mtd: rawnand: nuvoton: add new driver for the Nuvoton MA35 SoC") Signed-off-by: Christophe JAILLET Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c b/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c index c23b537948d5..1a285cd8fad6 100644 --- a/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c +++ b/drivers/mtd/nand/raw/nuvoton-ma35d1-nand-controller.c @@ -935,10 +935,10 @@ static void ma35_chips_cleanup(struct ma35_nand_info *nand) static int ma35_nand_chips_init(struct device *dev, struct ma35_nand_info *nand) { - struct device_node *np = dev->of_node, *nand_np; + struct device_node *np = dev->of_node; int ret; - for_each_child_of_node(np, nand_np) { + for_each_child_of_node_scoped(np, nand_np) { ret = ma35_nand_chip_init(dev, nand, nand_np); if (ret) { ma35_chips_cleanup(nand); From 513c40e59d5a414ab763a9c84797534b5e8c208d Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Tue, 12 Aug 2025 09:26:58 +0200 Subject: [PATCH 048/450] mtd: rawnand: stm32_fmc2: avoid overlapping mappings on ECC buffer Avoid below overlapping mappings by using a contiguous non-cacheable buffer. [ 4.077708] DMA-API: stm32_fmc2_nfc 48810000.nand-controller: cacheline tracking EEXIST, overlapping mappings aren't supported [ 4.089103] WARNING: CPU: 1 PID: 44 at kernel/dma/debug.c:568 add_dma_entry+0x23c/0x300 [ 4.097071] Modules linked in: [ 4.100101] CPU: 1 PID: 44 Comm: kworker/u4:2 Not tainted 6.1.82 #1 [ 4.106346] Hardware name: STMicroelectronics STM32MP257F VALID1 SNOR / MB1704 (LPDDR4 Power discrete) + MB1703 + MB1708 (SNOR MB1730) (DT) [ 4.118824] Workqueue: events_unbound deferred_probe_work_func [ 4.124674] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 4.131624] pc : add_dma_entry+0x23c/0x300 [ 4.135658] lr : add_dma_entry+0x23c/0x300 [ 4.139792] sp : ffff800009dbb490 [ 4.143016] x29: ffff800009dbb4a0 x28: 0000000004008022 x27: ffff8000098a6000 [ 4.150174] x26: 0000000000000000 x25: ffff8000099e7000 x24: ffff8000099e7de8 [ 4.157231] x23: 00000000ffffffff x22: 0000000000000000 x21: ffff8000098a6a20 [ 4.164388] x20: ffff000080964180 x19: ffff800009819ba0 x18: 0000000000000006 [ 4.171545] x17: 6361727420656e69 x16: 6c6568636163203a x15: 72656c6c6f72746e [ 4.178602] x14: 6f632d646e616e2e x13: ffff800009832f58 x12: 00000000000004ec [ 4.185759] x11: 00000000000001a4 x10: ffff80000988af58 x9 : ffff800009832f58 [ 4.192916] x8 : 00000000ffffefff x7 : ffff80000988af58 x6 : 80000000fffff000 [ 4.199972] x5 : 000000000000bff4 x4 : 0000000000000000 x3 : 0000000000000000 [ 4.207128] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0000812d2c40 [ 4.214185] Call trace: [ 4.216605] add_dma_entry+0x23c/0x300 [ 4.220338] debug_dma_map_sg+0x198/0x350 [ 4.224373] __dma_map_sg_attrs+0xa0/0x110 [ 4.228411] dma_map_sg_attrs+0x10/0x2c [ 4.232247] stm32_fmc2_nfc_xfer.isra.0+0x1c8/0x3fc [ 4.237088] stm32_fmc2_nfc_seq_read_page+0xc8/0x174 [ 4.242127] nand_read_oob+0x1d4/0x8e0 [ 4.245861] mtd_read_oob_std+0x58/0x84 [ 4.249596] mtd_read_oob+0x90/0x150 [ 4.253231] mtd_read+0x68/0xac Signed-off-by: Christophe Kerello Cc: stable@vger.kernel.org Fixes: 2cd457f328c1 ("mtd: rawnand: stm32_fmc2: add STM32 FMC2 NAND flash controller driver") Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/stm32_fmc2_nand.c | 28 +++++++++----------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/drivers/mtd/nand/raw/stm32_fmc2_nand.c b/drivers/mtd/nand/raw/stm32_fmc2_nand.c index a960403081f1..222c3c3b6848 100644 --- a/drivers/mtd/nand/raw/stm32_fmc2_nand.c +++ b/drivers/mtd/nand/raw/stm32_fmc2_nand.c @@ -272,6 +272,7 @@ struct stm32_fmc2_nfc { struct sg_table dma_data_sg; struct sg_table dma_ecc_sg; u8 *ecc_buf; + dma_addr_t dma_ecc_addr; int dma_ecc_len; u32 tx_dma_max_burst; u32 rx_dma_max_burst; @@ -902,17 +903,10 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, if (!write_data && !raw) { /* Configure DMA ECC status */ - p = nfc->ecc_buf; for_each_sg(nfc->dma_ecc_sg.sgl, sg, eccsteps, s) { - sg_set_buf(sg, p, nfc->dma_ecc_len); - p += nfc->dma_ecc_len; - } - - ret = dma_map_sg(nfc->dev, nfc->dma_ecc_sg.sgl, - eccsteps, dma_data_dir); - if (!ret) { - ret = -EIO; - goto err_unmap_data; + sg_dma_address(sg) = nfc->dma_ecc_addr + + s * nfc->dma_ecc_len; + sg_dma_len(sg) = nfc->dma_ecc_len; } desc_ecc = dmaengine_prep_slave_sg(nfc->dma_ecc_ch, @@ -921,7 +915,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, DMA_PREP_INTERRUPT); if (!desc_ecc) { ret = -ENOMEM; - goto err_unmap_ecc; + goto err_unmap_data; } reinit_completion(&nfc->dma_ecc_complete); @@ -929,7 +923,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, desc_ecc->callback_param = &nfc->dma_ecc_complete; ret = dma_submit_error(dmaengine_submit(desc_ecc)); if (ret) - goto err_unmap_ecc; + goto err_unmap_data; dma_async_issue_pending(nfc->dma_ecc_ch); } @@ -949,7 +943,7 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, if (!write_data && !raw) dmaengine_terminate_all(nfc->dma_ecc_ch); ret = -ETIMEDOUT; - goto err_unmap_ecc; + goto err_unmap_data; } /* Wait DMA data transfer completion */ @@ -969,11 +963,6 @@ static int stm32_fmc2_nfc_xfer(struct nand_chip *chip, const u8 *buf, } } -err_unmap_ecc: - if (!write_data && !raw) - dma_unmap_sg(nfc->dev, nfc->dma_ecc_sg.sgl, - eccsteps, dma_data_dir); - err_unmap_data: dma_unmap_sg(nfc->dev, nfc->dma_data_sg.sgl, eccsteps, dma_data_dir); @@ -1610,7 +1599,8 @@ static int stm32_fmc2_nfc_dma_setup(struct stm32_fmc2_nfc *nfc) return ret; /* Allocate a buffer to store ECC status registers */ - nfc->ecc_buf = devm_kzalloc(nfc->dev, FMC2_MAX_ECC_BUF_LEN, GFP_KERNEL); + nfc->ecc_buf = dmam_alloc_coherent(nfc->dev, FMC2_MAX_ECC_BUF_LEN, + &nfc->dma_ecc_addr, GFP_KERNEL); if (!nfc->ecc_buf) return -ENOMEM; From 811c0da4542df3c065f6cb843ced68780e27bb44 Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Tue, 12 Aug 2025 09:30:08 +0200 Subject: [PATCH 049/450] mtd: rawnand: stm32_fmc2: fix ECC overwrite In case OOB write is requested during a data write, ECC is currently lost. Avoid this issue by only writing in the free spare area. This issue has been seen with a YAFFS2 file system. Signed-off-by: Christophe Kerello Cc: stable@vger.kernel.org Fixes: 2cd457f328c1 ("mtd: rawnand: stm32_fmc2: add STM32 FMC2 NAND flash controller driver") Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/stm32_fmc2_nand.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/nand/raw/stm32_fmc2_nand.c b/drivers/mtd/nand/raw/stm32_fmc2_nand.c index 222c3c3b6848..d957327fb4fa 100644 --- a/drivers/mtd/nand/raw/stm32_fmc2_nand.c +++ b/drivers/mtd/nand/raw/stm32_fmc2_nand.c @@ -985,9 +985,21 @@ static int stm32_fmc2_nfc_seq_write(struct nand_chip *chip, const u8 *buf, /* Write oob */ if (oob_required) { - ret = nand_change_write_column_op(chip, mtd->writesize, - chip->oob_poi, mtd->oobsize, - false); + unsigned int offset_in_page = mtd->writesize; + const void *buf = chip->oob_poi; + unsigned int len = mtd->oobsize; + + if (!raw) { + struct mtd_oob_region oob_free; + + mtd_ooblayout_free(mtd, 0, &oob_free); + offset_in_page += oob_free.offset; + buf += oob_free.offset; + len = oob_free.length; + } + + ret = nand_change_write_column_op(chip, offset_in_page, + buf, len, false); if (ret) return ret; } From fd779eac2d659668be4d3dbdac0710afd5d6db12 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Thu, 21 Aug 2025 14:00:57 +0200 Subject: [PATCH 050/450] mtd: nand: raw: atmel: Respect tAR, tCLR in read setup timing Having setup time 0 violates tAR, tCLR of some chips, for instance TOSHIBA TC58NVG2S3ETAI0 cannot be detected successfully (first ID byte being read duplicated, i.e. 98 98 dc 90 15 76 14 03 instead of 98 dc 90 15 76 ...). Atmel Application Notes postulated 1 cycle NRD_SETUP without explanation [1], but it looks more appropriate to just calculate setup time properly. [1] Link: https://ww1.microchip.com/downloads/aemDocuments/documents/MPU32/ApplicationNotes/ApplicationNotes/doc6255.pdf Cc: stable@vger.kernel.org Fixes: f9ce2eddf176 ("mtd: nand: atmel: Add ->setup_data_interface() hooks") Signed-off-by: Alexander Sverdlin Tested-by: Alexander Dahl Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/atmel/nand-controller.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/nand/raw/atmel/nand-controller.c b/drivers/mtd/nand/raw/atmel/nand-controller.c index 84ab4a83cbd6..db94d14a3807 100644 --- a/drivers/mtd/nand/raw/atmel/nand-controller.c +++ b/drivers/mtd/nand/raw/atmel/nand-controller.c @@ -1377,14 +1377,24 @@ static int atmel_smc_nand_prepare_smcconf(struct atmel_nand *nand, if (ret) return ret; + /* + * Read setup timing depends on the operation done on the NAND: + * + * NRD_SETUP = max(tAR, tCLR) + */ + timeps = max(conf->timings.sdr.tAR_min, conf->timings.sdr.tCLR_min); + ncycles = DIV_ROUND_UP(timeps, mckperiodps); + totalcycles += ncycles; + ret = atmel_smc_cs_conf_set_setup(smcconf, ATMEL_SMC_NRD_SHIFT, ncycles); + if (ret) + return ret; + /* * The read cycle timing is directly matching tRC, but is also * dependent on the setup and hold timings we calculated earlier, * which gives: * - * NRD_CYCLE = max(tRC, NRD_PULSE + NRD_HOLD) - * - * NRD_SETUP is always 0. + * NRD_CYCLE = max(tRC, NRD_SETUP + NRD_PULSE + NRD_HOLD) */ ncycles = DIV_ROUND_UP(conf->timings.sdr.tRC_min, mckperiodps); ncycles = max(totalcycles, ncycles); From 85941afd2c404247e583c827fae0a45da1c1d92c Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 25 Aug 2025 09:53:27 +0200 Subject: [PATCH 051/450] s390/pai: Deny all events not handled by this PMU Each PAI PMU device driver returns -EINVAL when an event is out of its accepted range. This return value aborts the search for an alternative PMU device driver to handle this event. Change the return value to -ENOENT. This return value is used to try other PMUs instead. This makes the PMUs more robust when the sequence of PMU device driver initialization changes (at boot time) or by using modules. Fixes: 39d62336f5c12 ("s390/pai: add support for cryptography counters") Acked-by: Sumanth Korikkar Signed-off-by: Thomas Richter Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_pai_crypto.c | 4 ++-- arch/s390/kernel/perf_pai_ext.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index f373a1009c45..9455f213dc20 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -285,10 +285,10 @@ static int paicrypt_event_init(struct perf_event *event) /* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */ if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type) return -ENOENT; - /* PAI crypto event must be in valid range */ + /* PAI crypto event must be in valid range, try others if not */ if (a->config < PAI_CRYPTO_BASE || a->config > PAI_CRYPTO_BASE + paicrypt_cnt) - return -EINVAL; + return -ENOENT; /* Allow only CRYPTO_ALL for sampling */ if (a->sample_period && a->config != PAI_CRYPTO_BASE) return -EINVAL; diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index d827473e7f87..7b32935273ce 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -265,7 +265,7 @@ static int paiext_event_valid(struct perf_event *event) event->hw.config_base = offsetof(struct paiext_cb, acc); return 0; } - return -EINVAL; + return -ENOENT; } /* Might be called on different CPU than the one the event is intended for. */ From ce971233242b5391d99442271f3ca096fb49818d Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 22 Aug 2025 14:05:57 +0200 Subject: [PATCH 052/450] s390/cpum_cf: Deny all sampling events by counter PMU Deny all sampling event by the CPUMF counter facility device driver and return -ENOENT. This return value is used to try other PMUs. Up to now events for type PERF_TYPE_HARDWARE were not tested for sampling and returned later on -EOPNOTSUPP. This ends the search for alternative PMUs. Change that behavior and try other PMUs instead. Fixes: 613a41b0d16e ("s390/cpum_cf: Reject request for sampling in event initialization") Acked-by: Sumanth Korikkar Signed-off-by: Thomas Richter Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_cpum_cf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 4d09954ebf49..04457d88e589 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -760,8 +760,6 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) break; case PERF_TYPE_HARDWARE: - if (is_sampling_event(event)) /* No sampling support */ - return -ENOENT; ev = attr->config; if (!attr->exclude_user && attr->exclude_kernel) { /* @@ -859,6 +857,8 @@ static int cpumf_pmu_event_init(struct perf_event *event) unsigned int type = event->attr.type; int err = -ENOENT; + if (is_sampling_event(event)) /* No sampling support */ + return err; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW) err = __hw_perf_event_init(event, type); else if (event->pmu->type == type) From bb585591ebf00fb1f6a1fdd1ea96b5848bd9112d Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 27 Aug 2025 21:43:09 +0200 Subject: [PATCH 053/450] fhandle: use more consistent rules for decoding file handle from userns Commit 620c266f39493 ("fhandle: relax open_by_handle_at() permission checks") relaxed the coditions for decoding a file handle from non init userns. The conditions are that that decoded dentry is accessible from the user provided mountfd (or to fs root) and that all the ancestors along the path have a valid id mapping in the userns. These conditions are intentionally more strict than the condition that the decoded dentry should be "lookable" by path from the mountfd. For example, the path /home/amir/dir/subdir is lookable by path from unpriv userns of user amir, because /home perms is 755, but the owner of /home does not have a valid id mapping in unpriv userns of user amir. The current code did not check that the decoded dentry itself has a valid id mapping in the userns. There is no security risk in that, because that final open still performs the needed permission checks, but this is inconsistent with the checks performed on the ancestors, so the behavior can be a bit confusing. Add the check for the decoded dentry itself, so that the entire path, including the last component has a valid id mapping in the userns. Fixes: 620c266f39493 ("fhandle: relax open_by_handle_at() permission checks") Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/20250827194309.1259650-1-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/fhandle.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/fhandle.c b/fs/fhandle.c index 68a7d2861c58..a907ddfac4d5 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -207,6 +207,14 @@ static int vfs_dentry_acceptable(void *context, struct dentry *dentry) if (!ctx->flags) return 1; + /* + * Verify that the decoded dentry itself has a valid id mapping. + * In case the decoded dentry is the mountfd root itself, this + * verifies that the mountfd inode itself has a valid id mapping. + */ + if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry))) + return 0; + /* * It's racy as we're not taking rename_lock but we're able to ignore * permissions and we just need an approximation whether we were able From 31f1a960ad1a14def94fa0b8c25d62b4c032813f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Aug 2025 09:02:16 -0700 Subject: [PATCH 054/450] NFSv4: Don't clear capabilities that won't be reset Don't clear the capabilities that are not going to get reset by the call to _nfs4_server_capabilities(). Reported-by: Scott Haiden Fixes: b01f21cacde9 ("NFS: Fix the setting of capabilities when automounting a new filesystem") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7d2b67e06cc3..5b92fcf45dd7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4092,7 +4092,6 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) }; int err; - nfs_server_set_init_caps(server); do { err = nfs4_handle_exception(server, _nfs4_server_capabilities(server, fhandle), From dd5a8621b886b02f8341c5d4ea68eb2c552ebd3e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Aug 2025 09:07:22 -0700 Subject: [PATCH 055/450] NFSv4: Clear the NFS_CAP_FS_LOCATIONS flag if it is not set _nfs4_server_capabilities() is expected to clear any flags that are not supported by the server. Fixes: 8a59bb93b7e3 ("NFSv4 store server support for fs_location attribute") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5b92fcf45dd7..d0f91d9430f6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4013,8 +4013,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f res.attr_bitmask[2]; } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); - server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | - NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL); + server->caps &= + ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS | + NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS); server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) From b3ac33436030bce37ecb3dcae581ecfaad28078c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Aug 2025 09:12:30 -0700 Subject: [PATCH 056/450] NFSv4: Clear NFS_CAP_OPEN_XOR and NFS_CAP_DELEGTIME if not supported _nfs4_server_capabilities() should clear capabilities that are not supported by the server. Fixes: d2a00cceb93a ("NFSv4: Detect support for OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d0f91d9430f6..ce61253efd45 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4015,7 +4015,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS | - NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS); + NFS_CAP_SECURITY_LABEL | NFS_CAP_FS_LOCATIONS | + NFS_CAP_OPEN_XOR | NFS_CAP_DELEGTIME); server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) From 4fb2b677fc1f70ee642c0beecc3cabf226ef5707 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 29 Aug 2025 09:15:12 -0700 Subject: [PATCH 057/450] NFSv4: Clear the NFS_CAP_XATTR flag if not supported by the server nfs_server_set_fsinfo() shouldn't assume that NFS_CAP_XATTR is unset on entry to the function. Fixes: b78ef845c35d ("NFSv4.2: query the server for extended attribute support") Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8fb4a950dd55..4e3dcc157a83 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -888,6 +888,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; + else + server->caps &= ~NFS_CAP_XATTR; #endif } From 8a68d64bb10334426834e8c273319601878e961e Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 29 Aug 2025 15:28:52 -0700 Subject: [PATCH 058/450] x86/vmscape: Add old Intel CPUs to affected list These old CPUs are not tested against VMSCAPE, but are likely vulnerable. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen --- arch/x86/kernel/cpu/common.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2b87c93e6609..f98ec9c7fc07 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1240,15 +1240,18 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define VMSCAPE BIT(11) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { - VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), - VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE), VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), From 3712ce9fa501617cdc4466d30ae3894d50887743 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Wed, 27 Aug 2025 19:58:42 +0200 Subject: [PATCH 059/450] gpiolib: acpi: Ignore touchpad wakeup on GPD G1619-05 Same issue as G1619-04 in commit 805c74eac8cb ("gpiolib: acpi: Ignore touchpad wakeup on GPD G1619-04"), Strix Point lineup uses 05. Signed-off-by: Antheas Kapenekakis Reviewed-by: Mika Westerberg Reviewed-by: Mario Limonciello Signed-off-by: Andy Shevchenko --- drivers/gpio/gpiolib-acpi-quirks.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpio/gpiolib-acpi-quirks.c b/drivers/gpio/gpiolib-acpi-quirks.c index c13545dce349..bb63138c4b5b 100644 --- a/drivers/gpio/gpiolib-acpi-quirks.c +++ b/drivers/gpio/gpiolib-acpi-quirks.c @@ -317,6 +317,18 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = { .ignore_wake = "PNP0C50:00@8", }, }, + { + /* + * Same as G1619-04. New model. + */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GPD"), + DMI_MATCH(DMI_PRODUCT_NAME, "G1619-05"), + }, + .driver_data = &(struct acpi_gpiolib_dmi_quirk) { + .ignore_wake = "PNP0C50:00@8", + }, + }, { /* * Spurious wakeups from GPIO 11 From 5c5a41a75452e9eb5810a7d0d9916b61fe9fd1c5 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Fri, 29 Aug 2025 00:39:40 +0200 Subject: [PATCH 060/450] gpu: nova-core: depend on CONFIG_64BIT If built on architectures with CONFIG_ARCH_DMA_ADDR_T_64BIT=y nova-core produces that following build failures: error[E0308]: mismatched types --> drivers/gpu/nova-core/fb.rs:49:59 | 49 | hal::fb_hal(chipset).write_sysmem_flush_page(bar, page.dma_handle())?; | ----------------------- ^^^^^^^^^^^^^^^^^ expected `u64`, found `u32` | | | arguments to this method are incorrect | note: method defined here --> drivers/gpu/nova-core/fb/hal.rs:19:8 | 19 | fn write_sysmem_flush_page(&self, bar: &Bar0, addr: u64) -> Result; | ^^^^^^^^^^^^^^^^^^^^^^^ help: you can convert a `u32` to a `u64` | 49 | hal::fb_hal(chipset).write_sysmem_flush_page(bar, page.dma_handle().into())?; | +++++++ error[E0308]: mismatched types --> drivers/gpu/nova-core/fb.rs:65:47 | 65 | if hal.read_sysmem_flush_page(bar) == self.page.dma_handle() { | ------------------------------- ^^^^^^^^^^^^^^^^^^^^^^ expected `u64`, found `u32` | | | expected because this is `u64` | help: you can convert a `u32` to a `u64` | 65 | if hal.read_sysmem_flush_page(bar) == self.page.dma_handle().into() { | +++++++ error: this arithmetic operation will overflow --> drivers/gpu/nova-core/falcon.rs:469:23 | 469 | .set_base((dma_start >> 40) as u16) | ^^^^^^^^^^^^^^^^^ attempt to shift right by `40_i32`, which would overflow | = note: `#[deny(arithmetic_overflow)]` on by default This is due to the code making assumptions on the width of dma_addr_t to be 64 bit. While this could technically be handled, it is rather painful to deal with, as the following example illustrates: pub(super) fn read_sysmem_flush_page_ga100(bar: &Bar0) -> DmaAddress { let addr = u64::from(regs::NV_PFB_NISO_FLUSH_SYSMEM_ADDR::read(bar).adr_39_08()) << FLUSH_SYSMEM_ADDR_SHIFT | u64::from(regs::NV_PFB_NISO_FLUSH_SYSMEM_ADDR_HI::read(bar).adr_63_40()) << FLUSH_SYSMEM_ADDR_SHIFT_HI; addr.try_into().unwrap_or_else(|_| { kernel::warn_on!(true); 0 }) } At the same time there's not much value for nova-core to support 32-bit, given that the supported GPU architectures are Turing and later, hence depend on CONFIG_64BIT. Cc: John Hubbard Reported-by: Miguel Ojeda Closes: https://lore.kernel.org/lkml/20250828160247.37492-1-ojeda@kernel.org/ Fixes: 6554ad65b589 ("gpu: nova-core: register sysmem flush page") Fixes: 69f5cd67ce41 ("gpu: nova-core: add falcon register definitions and base code") Reviewed-by: Alexandre Courbot Reviewed-by: John Hubbard Link: https://lore.kernel.org/r/20250828223954.351348-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- drivers/gpu/nova-core/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/nova-core/Kconfig b/drivers/gpu/nova-core/Kconfig index 8726d80d6ba4..20d3e6d0d796 100644 --- a/drivers/gpu/nova-core/Kconfig +++ b/drivers/gpu/nova-core/Kconfig @@ -1,5 +1,6 @@ config NOVA_CORE tristate "Nova Core GPU driver" + depends on 64BIT depends on PCI depends on RUST depends on RUST_FW_LOADER_ABSTRACTIONS From f54d87dad7619c8026e95b848d6ef677b9f2b55f Mon Sep 17 00:00:00 2001 From: Shuming Fan Date: Mon, 1 Sep 2025 16:57:57 +0800 Subject: [PATCH 061/450] ASoC: rt712: avoid skipping the blind write Some devices might not use the DMIC function of the RT712VB. Therefore, this patch avoids skipping the blind write with RT712VB. Signed-off-by: Shuming Fan Link: https://patch.msgid.link/20250901085757.1287945-1-shumingf@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt712-sdca.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/rt712-sdca.c b/sound/soc/codecs/rt712-sdca.c index 5b298db5f0f6..0ebaae426e73 100644 --- a/sound/soc/codecs/rt712-sdca.c +++ b/sound/soc/codecs/rt712-sdca.c @@ -1890,11 +1890,9 @@ int rt712_sdca_io_init(struct device *dev, struct sdw_slave *slave) rt712_sdca_va_io_init(rt712); } else { - if (!rt712->dmic_function_found) { - dev_err(&slave->dev, "%s RT712 VB detected but no SMART_MIC function exposed in ACPI\n", + if (!rt712->dmic_function_found) + dev_warn(&slave->dev, "%s RT712 VB detected but no SMART_MIC function exposed in ACPI\n", __func__); - goto suspend; - } /* multilanes and DMIC are supported by rt712vb */ prop->lane_control_support = true; From d05afb53c683ef7ed1228b593c3360f4d3126c58 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 21 Aug 2025 09:26:37 +0100 Subject: [PATCH 062/450] ASoC: wm8940: Correct PLL rate rounding Using a single value of 22500000 for both 48000Hz and 44100Hz audio will sometimes result in returning wrong dividers due to rounding. Update the code to use the actual value for both. Fixes: 294833fc9eb4 ("ASoC: wm8940: Rewrite code to set proper clocks") Reported-by: Ankur Tyagi Signed-off-by: Charles Keepax Tested-by: Ankur Tyagi Link: https://patch.msgid.link/20250821082639.1301453-2-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/wm8940.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/wm8940.c b/sound/soc/codecs/wm8940.c index 401ee20897b1..46c16c9bc17a 100644 --- a/sound/soc/codecs/wm8940.c +++ b/sound/soc/codecs/wm8940.c @@ -693,7 +693,12 @@ static int wm8940_update_clocks(struct snd_soc_dai *dai) f = wm8940_get_mclkdiv(priv->mclk, fs256, &mclkdiv); if (f != priv->mclk) { /* The PLL performs best around 90MHz */ - fpll = wm8940_get_mclkdiv(22500000, fs256, &mclkdiv); + if (fs256 % 8000) + f = 22579200; + else + f = 24576000; + + fpll = wm8940_get_mclkdiv(f, fs256, &mclkdiv); } wm8940_set_dai_pll(dai, 0, 0, priv->mclk, fpll); From b4799520dcd6fe1e14495cecbbe9975d847cd482 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 21 Aug 2025 09:26:38 +0100 Subject: [PATCH 063/450] ASoC: wm8940: Correct typo in control name Fixes: 0b5e92c5e020 ("ASoC WM8940 Driver") Reported-by: Ankur Tyagi Signed-off-by: Charles Keepax Tested-by: Ankur Tyagi Link: https://patch.msgid.link/20250821082639.1301453-3-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/wm8940.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/wm8940.c b/sound/soc/codecs/wm8940.c index 46c16c9bc17a..94873ea63014 100644 --- a/sound/soc/codecs/wm8940.c +++ b/sound/soc/codecs/wm8940.c @@ -220,7 +220,7 @@ static const struct snd_kcontrol_new wm8940_snd_controls[] = { SOC_SINGLE_TLV("Digital Capture Volume", WM8940_ADCVOL, 0, 255, 0, wm8940_adc_tlv), SOC_ENUM("Mic Bias Level", wm8940_mic_bias_level_enum), - SOC_SINGLE_TLV("Capture Boost Volue", WM8940_ADCBOOST, + SOC_SINGLE_TLV("Capture Boost Volume", WM8940_ADCBOOST, 8, 1, 0, wm8940_capture_boost_vol_tlv), SOC_SINGLE_TLV("Speaker Playback Volume", WM8940_SPKVOL, 0, 63, 0, wm8940_spk_vol_tlv), From 9b17d3724df55ecc2bc67978822585f2b023be48 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 21 Aug 2025 09:26:39 +0100 Subject: [PATCH 064/450] ASoC: wm8974: Correct PLL rate rounding Using a single value of 22500000 for both 48000Hz and 44100Hz audio will sometimes result in returning wrong dividers due to rounding. Update the code to use the actual value for both. Fixes: 51b2bb3f2568 ("ASoC: wm8974: configure pll and mclk divider automatically") Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250821082639.1301453-4-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/wm8974.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wm8974.c b/sound/soc/codecs/wm8974.c index bdf437a5403f..db16d893a235 100644 --- a/sound/soc/codecs/wm8974.c +++ b/sound/soc/codecs/wm8974.c @@ -419,10 +419,14 @@ static int wm8974_update_clocks(struct snd_soc_dai *dai) fs256 = 256 * priv->fs; f = wm8974_get_mclkdiv(priv->mclk, fs256, &mclkdiv); - if (f != priv->mclk) { /* The PLL performs best around 90MHz */ - fpll = wm8974_get_mclkdiv(22500000, fs256, &mclkdiv); + if (fs256 % 8000) + f = 22579200; + else + f = 24576000; + + fpll = wm8974_get_mclkdiv(f, fs256, &mclkdiv); } wm8974_set_dai_pll(dai, 0, 0, priv->mclk, fpll); From a22d3b0d49d411e64ed07e30c2095035ecb30ed2 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 19 Aug 2025 08:56:22 +0200 Subject: [PATCH 065/450] phy: ti: gmii-sel: Always write the RGMII ID setting Some SoCs are just validated with the TX delay enabled. With commit ca13b249f291 ("net: ethernet: ti: am65-cpsw: fixup PHY mode for fixed RGMII TX delay"), the network driver will patch the delay setting on the fly assuming that the TX delay setting is fixed. In reality, the TX delay is configurable and just skipped in the documentation. There are bootloaders, which will disable the TX delay and this will lead to a transmit path which doesn't add any delays at all. Fix that by always writing the RGMII_ID setting and report an error for unsupported RGMII delay modes. This is safe to do and shouldn't break any boards in mainline because the fixed delay is only introduced for gmii-sel compatibles which are used together with the am65-cpsw-nuss driver and also contains the commit above. Fixes: ca13b249f291 ("net: ethernet: ti: am65-cpsw: fixup PHY mode for fixed RGMII TX delay") Signed-off-by: Michael Walle Reviewed-by: Maxime Chevallier Link: https://lore.kernel.org/r/20250819065622.1019537-1-mwalle@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/ti/phy-gmii-sel.c | 47 +++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/drivers/phy/ti/phy-gmii-sel.c b/drivers/phy/ti/phy-gmii-sel.c index ff5d5e29629f..50adabb867cb 100644 --- a/drivers/phy/ti/phy-gmii-sel.c +++ b/drivers/phy/ti/phy-gmii-sel.c @@ -34,6 +34,7 @@ enum { PHY_GMII_SEL_PORT_MODE = 0, PHY_GMII_SEL_RGMII_ID_MODE, PHY_GMII_SEL_RMII_IO_CLK_EN, + PHY_GMII_SEL_FIXED_TX_DELAY, PHY_GMII_SEL_LAST, }; @@ -127,6 +128,11 @@ static int phy_gmii_sel_mode(struct phy *phy, enum phy_mode mode, int submode) goto unsupported; } + /* With a fixed delay, some modes are not supported at all. */ + if (soc_data->features & BIT(PHY_GMII_SEL_FIXED_TX_DELAY) && + rgmii_id != 0) + return -EINVAL; + if_phy->phy_if_mode = submode; dev_dbg(dev, "%s id:%u mode:%u rgmii_id:%d rmii_clk_ext:%d\n", @@ -210,25 +216,46 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_soc_dm814 = { static const struct reg_field phy_gmii_sel_fields_am654[][PHY_GMII_SEL_LAST] = { - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x0, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x4, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x8, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0xC, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x10, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x14, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x18, 0, 2), }, - { [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x1C, 0, 2), }, + { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x0, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x0, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x4, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x4, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x8, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x8, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0xC, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0xC, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x10, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x10, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x14, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x14, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x18, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x18, 4, 4), + }, { + [PHY_GMII_SEL_PORT_MODE] = REG_FIELD(0x1C, 0, 2), + [PHY_GMII_SEL_RGMII_ID_MODE] = REG_FIELD(0x1C, 4, 4), + }, }; static const struct phy_gmii_sel_soc_data phy_gmii_sel_soc_am654 = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, }; static const struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw5g_soc_j7200 = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII) | BIT(PHY_INTERFACE_MODE_USXGMII), @@ -239,6 +266,8 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw5g_soc_j7200 = { static const struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j721e = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII), .num_ports = 8, @@ -248,6 +277,8 @@ struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j721e = { static const struct phy_gmii_sel_soc_data phy_gmii_sel_cpsw9g_soc_j784s4 = { .use_of_data = true, + .features = BIT(PHY_GMII_SEL_RGMII_ID_MODE) | + BIT(PHY_GMII_SEL_FIXED_TX_DELAY), .regfields = phy_gmii_sel_fields_am654, .extra_modes = BIT(PHY_INTERFACE_MODE_QSGMII) | BIT(PHY_INTERFACE_MODE_SGMII) | BIT(PHY_INTERFACE_MODE_USXGMII), From 6cb8c1f957f674ca20b7d7c96b1f1bb11b83b679 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Thu, 21 Aug 2025 10:01:47 +0200 Subject: [PATCH 066/450] phy: qcom: qmp-pcie: Fix PHY initialization when powered down by firmware Commit 0cc22f5a861c ("phy: qcom: qmp-pcie: Add PHY register retention support") added support for using the "no_csr" reset to skip configuration of the PHY if the init sequence was already applied by the boot firmware. The expectation is that the PHY is only turned on/off by using the "no_csr" reset, instead of powering it down and re-programming it after a full reset. The boot firmware on X1E does not fully conform to this expectation: If the PCIe3 link fails to come up (e.g. because no PCIe card is inserted), the firmware powers down the PHY using the QPHY_PCS_POWER_DOWN_CONTROL register. The QPHY_START_CTRL register is kept as-is, so the driver assumes the PHY is already initialized and skips the configuration/power up sequence. The PHY won't come up again without clearing the QPHY_PCS_POWER_DOWN_CONTROL, so eventually initialization fails: qcom-qmp-pcie-phy 1be0000.phy: phy initialization timed-out phy phy-1be0000.phy.0: phy poweron failed --> -110 qcom-pcie 1bd0000.pcie: cannot initialize host qcom-pcie 1bd0000.pcie: probe with driver qcom-pcie failed with error -110 This can be reliably reproduced on the X1E CRD, QCP and Devkit when no card is inserted for PCIe3. Fix this by checking the QPHY_PCS_POWER_DOWN_CONTROL register in addition to QPHY_START_CTRL. If the PHY is powered down with the register, it doesn't conform to the expectations for using the "no_csr" reset, so we fully re-initialize with the normal reset sequence. Also check the register more carefully to ensure all of the bits we expect are actually set. A simple !!(readl()) is not enough, because the PHY might be only partially set up with some of the expected bits set. Cc: stable@vger.kernel.org Fixes: 0cc22f5a861c ("phy: qcom: qmp-pcie: Add PHY register retention support") Signed-off-by: Stephan Gerhold Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250821-phy-qcom-qmp-pcie-nocsr-fix-v3-1-4898db0cc07c@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/qualcomm/phy-qcom-qmp-pcie.c | 25 ++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c index 95830dcfdec9..0fa63b734b67 100644 --- a/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c +++ b/drivers/phy/qualcomm/phy-qcom-qmp-pcie.c @@ -3067,6 +3067,14 @@ struct qmp_pcie { struct clk_fixed_rate aux_clk_fixed; }; +static bool qphy_checkbits(const void __iomem *base, u32 offset, u32 val) +{ + u32 reg; + + reg = readl(base + offset); + return (reg & val) == val; +} + static inline void qphy_setbits(void __iomem *base, u32 offset, u32 val) { u32 reg; @@ -4339,16 +4347,21 @@ static int qmp_pcie_init(struct phy *phy) struct qmp_pcie *qmp = phy_get_drvdata(phy); const struct qmp_phy_cfg *cfg = qmp->cfg; void __iomem *pcs = qmp->pcs; - bool phy_initialized = !!(readl(pcs + cfg->regs[QPHY_START_CTRL])); int ret; - qmp->skip_init = qmp->nocsr_reset && phy_initialized; /* - * We need to check the existence of init sequences in two cases: - * 1. The PHY doesn't support no_csr reset. - * 2. The PHY supports no_csr reset but isn't initialized by bootloader. - * As we can't skip init in these two cases. + * We can skip PHY initialization if all of the following conditions + * are met: + * 1. The PHY supports the nocsr_reset that preserves the PHY config. + * 2. The PHY was started (and not powered down again) by the + * bootloader, with all of the expected bits set correctly. + * In this case, we can continue without having the init sequence + * defined in the driver. */ + qmp->skip_init = qmp->nocsr_reset && + qphy_checkbits(pcs, cfg->regs[QPHY_START_CTRL], SERDES_START | PCS_START) && + qphy_checkbits(pcs, cfg->regs[QPHY_PCS_POWER_DOWN_CONTROL], cfg->pwrdn_ctrl); + if (!qmp->skip_init && !cfg->tbls.serdes_num) { dev_err(qmp->dev, "Init sequence not available\n"); return -ENODATA; From e63419dbf2ceb083c1651852209c7f048089ac0f Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Sat, 30 Aug 2025 11:49:53 +0200 Subject: [PATCH 067/450] dmaengine: ti: edma: Fix memory allocation size for queue_priority_map Fix a critical memory allocation bug in edma_setup_from_hw() where queue_priority_map was allocated with insufficient memory. The code declared queue_priority_map as s8 (*)[2] (pointer to array of 2 s8), but allocated memory using sizeof(s8) instead of the correct size. This caused out-of-bounds memory writes when accessing: queue_priority_map[i][0] = i; queue_priority_map[i][1] = i; The bug manifested as kernel crashes with "Oops - undefined instruction" on ARM platforms (BeagleBoard-X15) during EDMA driver probe, as the memory corruption triggered kernel hardening features on Clang. Change the allocation to use sizeof(*queue_priority_map) which automatically gets the correct size for the 2D array structure. Fixes: 2b6b3b742019 ("ARM/dmaengine: edma: Merge the two drivers under drivers/dma/") Signed-off-by: Anders Roxell Link: https://lore.kernel.org/r/20250830094953.3038012-1-anders.roxell@linaro.org Signed-off-by: Vinod Koul --- drivers/dma/ti/edma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/ti/edma.c b/drivers/dma/ti/edma.c index 3ed406f08c44..552be71db6c4 100644 --- a/drivers/dma/ti/edma.c +++ b/drivers/dma/ti/edma.c @@ -2064,8 +2064,8 @@ static int edma_setup_from_hw(struct device *dev, struct edma_soc_info *pdata, * priority. So Q0 is the highest priority queue and the last queue has * the lowest priority. */ - queue_priority_map = devm_kcalloc(dev, ecc->num_tc + 1, sizeof(s8), - GFP_KERNEL); + queue_priority_map = devm_kcalloc(dev, ecc->num_tc + 1, + sizeof(*queue_priority_map), GFP_KERNEL); if (!queue_priority_map) return -ENOMEM; From 7e2368a21741e2db542330b32aa6fdd8908e7cff Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 28 Aug 2025 16:17:33 +0800 Subject: [PATCH 068/450] dma-debug: don't enforce dma mapping check on noncoherent allocations As discussed in [1], there is no need to enforce dma mapping check on noncoherent allocations, a simple test on the returned CPU address is good enough. Add a new pair of debug helpers and use them for noncoherent alloc/free to fix this issue. Fixes: efa70f2fdc84 ("dma-mapping: add a new dma_alloc_pages API") Link: https://lore.kernel.org/all/ff6c1fe6-820f-4e58-8395-df06aa91706c@oss.qualcomm.com # 1 Signed-off-by: Baochen Qiang Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250828-dma-debug-fix-noncoherent-dma-check-v1-1-76e9be0dd7fc@oss.qualcomm.com --- kernel/dma/debug.c | 48 +++++++++++++++++++++++++++++++++++++++++++- kernel/dma/debug.h | 20 ++++++++++++++++++ kernel/dma/mapping.c | 4 ++-- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index e43c6de2bce4..b82399437db0 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -39,6 +39,7 @@ enum { dma_debug_sg, dma_debug_coherent, dma_debug_resource, + dma_debug_noncoherent, }; enum map_err_types { @@ -141,6 +142,7 @@ static const char *type2name[] = { [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", [dma_debug_resource] = "resource", + [dma_debug_noncoherent] = "noncoherent", }; static const char *dir2name[] = { @@ -993,7 +995,8 @@ static void check_unmap(struct dma_debug_entry *ref) "[mapped as %s] [unmapped as %s]\n", ref->dev_addr, ref->size, type2name[entry->type], type2name[ref->type]); - } else if (entry->type == dma_debug_coherent && + } else if ((entry->type == dma_debug_coherent || + entry->type == dma_debug_noncoherent) && ref->paddr != entry->paddr) { err_printk(ref->dev, entry, "device driver frees " "DMA memory with different CPU address " @@ -1581,6 +1584,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } } +void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_noncoherent; + entry->dev = dev; + entry->paddr = page_to_phys(page); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + + add_dma_entry(entry, attrs); +} + +void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_noncoherent, + .dev = dev, + .paddr = page_to_phys(page), + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} + static int __init dma_debug_driver_setup(char *str) { int i; diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index f525197d3cae..48757ca13f31 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -54,6 +54,13 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev, extern void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction); +extern void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs); +extern void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, @@ -126,5 +133,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, int nelems, int direction) { } + +static inline void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ +} + +static inline void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} #endif /* CONFIG_DMA_API_DEBUG */ #endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 107e4a4d251d..56de28a3b179 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -712,7 +712,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, if (page) { trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, size, dir, gfp, 0); - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); + debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0); } else { trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } @@ -738,7 +738,7 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); - debug_dma_unmap_page(dev, dma_handle, size, dir); + debug_dma_free_pages(dev, page, size, dir, dma_handle); __dma_free_pages(dev, size, page, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_pages); From e51bd0e595476c1527bb0b4def095a6fd16b2563 Mon Sep 17 00:00:00 2001 From: Xing Guo Date: Wed, 13 Aug 2025 11:16:47 +0800 Subject: [PATCH 069/450] selftests/fs/mount-notify: Fix compilation failure. Commit c6d9775c2066 ("selftests/fs/mount-notify: build with tools include dir") introduces the struct __kernel_fsid_t to decouple dependency with headers_install. The commit forgets to define a macro for __kernel_fsid_t and it will cause type re-definition issue. Signed-off-by: Xing Guo Link: https://lore.kernel.org/20250813031647.96411-1-higuoxing@gmail.com Acked-by: Amir Goldstein Closes: https://lore.kernel.org/oe-lkp/202508110628.65069d92-lkp@intel.com Signed-off-by: Christian Brauner --- .../mount-notify/mount-notify_test.c | 17 ++++++++--------- .../mount-notify/mount-notify_test_ns.c | 18 ++++++++---------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c index 63ce708d93ed..e4b7c2b457ee 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c @@ -2,6 +2,13 @@ // Copyright (c) 2025 Miklos Szeredi #define _GNU_SOURCE + +// Needed for linux/fanotify.h +typedef struct { + int val[2]; +} __kernel_fsid_t; +#define __kernel_fsid_t __kernel_fsid_t + #include #include #include @@ -10,20 +17,12 @@ #include #include #include +#include #include "../../kselftest_harness.h" #include "../statmount/statmount.h" #include "../utils.h" -// Needed for linux/fanotify.h -#ifndef __kernel_fsid_t -typedef struct { - int val[2]; -} __kernel_fsid_t; -#endif - -#include - static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; static const int mark_cmds[] = { diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c index 090a5ca65004..9f57ca46e3af 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c @@ -2,6 +2,13 @@ // Copyright (c) 2025 Miklos Szeredi #define _GNU_SOURCE + +// Needed for linux/fanotify.h +typedef struct { + int val[2]; +} __kernel_fsid_t; +#define __kernel_fsid_t __kernel_fsid_t + #include #include #include @@ -10,21 +17,12 @@ #include #include #include +#include #include "../../kselftest_harness.h" -#include "../../pidfd/pidfd.h" #include "../statmount/statmount.h" #include "../utils.h" -// Needed for linux/fanotify.h -#ifndef __kernel_fsid_t -typedef struct { - int val[2]; -} __kernel_fsid_t; -#endif - -#include - static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; static const int mark_types[] = { From f1b55db08d527240fbc3b8c84229134a98d8d080 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Fri, 29 Aug 2025 21:57:42 +0200 Subject: [PATCH 070/450] rust: device: fix unresolved link to drm::Device drm::Device is only available when CONFIG_DRM=y, which we have to consider for intra-doc links, otherwise the rustdoc make target produces the following warning. >> warning: unresolved link to `kernel::drm::Device` --> rust/kernel/device.rs:154:22 | 154 | /// [`drm::Device`]: kernel::drm::Device | ^^^^^^^^^^^^^^^^^^^ no item named `drm` in module `kernel` | = note: `#[warn(rustdoc::broken_intra_doc_links)]` on by default Fix this by making the intra-doc link conditional on CONFIG_DRM being enabled. Fixes: d6e26c1ae4a6 ("device: rust: expand documentation for Device") Suggested-by: Alice Ryhl Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508261644.9LclwUgt-lkp@intel.com/ Reviewed-by: Alice Ryhl Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250829195745.31174-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- rust/kernel/device.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rust/kernel/device.rs b/rust/kernel/device.rs index 5902b3714a16..a1db49eb159a 100644 --- a/rust/kernel/device.rs +++ b/rust/kernel/device.rs @@ -138,7 +138,9 @@ /// } /// ``` /// -/// An example for a class device implementation is [`drm::Device`]. +/// An example for a class device implementation is +#[cfg_attr(CONFIG_DRM = "y", doc = "[`drm::Device`](kernel::drm::Device).")] +#[cfg_attr(not(CONFIG_DRM = "y"), doc = "`drm::Device`.")] /// /// # Invariants /// @@ -151,7 +153,6 @@ /// dropped from any thread. /// /// [`AlwaysRefCounted`]: kernel::types::AlwaysRefCounted -/// [`drm::Device`]: kernel::drm::Device /// [`impl_device_context_deref`]: kernel::impl_device_context_deref /// [`pci::Device`]: kernel::pci::Device /// [`platform::Device`]: kernel::platform::Device From aa2e1e4563d3ab689ffa86ca1412ecbf9fd3b308 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 2 Sep 2025 17:03:58 +0800 Subject: [PATCH 071/450] dmaengine: dw: dmamux: Fix device reference leak in rzn1_dmamux_route_allocate The reference taken by of_find_device_by_node() must be released when not needed anymore. Add missing put_device() call to fix device reference leaks. Fixes: 134d9c52fca2 ("dmaengine: dw: dmamux: Introduce RZN1 DMA router support") Cc: stable@vger.kernel.org Signed-off-by: Miaoqian Lin Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20250902090358.2423285-1-linmq006@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/dw/rzn1-dmamux.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/dma/dw/rzn1-dmamux.c b/drivers/dma/dw/rzn1-dmamux.c index 4fb8508419db..deadf135681b 100644 --- a/drivers/dma/dw/rzn1-dmamux.c +++ b/drivers/dma/dw/rzn1-dmamux.c @@ -48,12 +48,16 @@ static void *rzn1_dmamux_route_allocate(struct of_phandle_args *dma_spec, u32 mask; int ret; - if (dma_spec->args_count != RNZ1_DMAMUX_NCELLS) - return ERR_PTR(-EINVAL); + if (dma_spec->args_count != RNZ1_DMAMUX_NCELLS) { + ret = -EINVAL; + goto put_device; + } map = kzalloc(sizeof(*map), GFP_KERNEL); - if (!map) - return ERR_PTR(-ENOMEM); + if (!map) { + ret = -ENOMEM; + goto put_device; + } chan = dma_spec->args[0]; map->req_idx = dma_spec->args[4]; @@ -94,12 +98,15 @@ static void *rzn1_dmamux_route_allocate(struct of_phandle_args *dma_spec, if (ret) goto clear_bitmap; + put_device(&pdev->dev); return map; clear_bitmap: clear_bit(map->req_idx, dmamux->used_chans); free_map: kfree(map); +put_device: + put_device(&pdev->dev); return ERR_PTR(ret); } From 78338108b5a856dc98223a335f147846a8a18c51 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 29 Aug 2025 15:57:34 +0300 Subject: [PATCH 072/450] ASoC: codec: sma1307: Fix memory corruption in sma1307_setting_loaded() The sma1307->set.header_size is how many integers are in the header (there are 8 of them) but instead of allocating space of 8 integers we allocate 8 bytes. This leads to memory corruption when we copy data it on the next line: memcpy(sma1307->set.header, data, sma1307->set.header_size * sizeof(int)); Also since we're immediately copying over the memory in ->set.header, there is no need to zero it in the allocator. Use devm_kmalloc_array() to allocate the memory instead. Fixes: 576c57e6b4c1 ("ASoC: sma1307: Add driver for Iron Device SMA1307") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/aLGjvjpueVstekXP@stanley.mountain Signed-off-by: Mark Brown --- sound/soc/codecs/sma1307.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/sma1307.c b/sound/soc/codecs/sma1307.c index 6a601e7134ea..b683e676640d 100644 --- a/sound/soc/codecs/sma1307.c +++ b/sound/soc/codecs/sma1307.c @@ -1737,9 +1737,10 @@ static void sma1307_setting_loaded(struct sma1307_priv *sma1307, const char *fil sma1307->set.checksum = data[sma1307->set.header_size - 2]; sma1307->set.num_mode = data[sma1307->set.header_size - 1]; num_mode = sma1307->set.num_mode; - sma1307->set.header = devm_kzalloc(sma1307->dev, - sma1307->set.header_size, - GFP_KERNEL); + sma1307->set.header = devm_kmalloc_array(sma1307->dev, + sma1307->set.header_size, + sizeof(int), + GFP_KERNEL); if (!sma1307->set.header) { sma1307->set.status = false; return; From f1d0260362d72f9f454dc1f9db2eeb80cb801f28 Mon Sep 17 00:00:00 2001 From: Venkata Prasad Potturu Date: Thu, 21 Aug 2025 11:15:47 +0530 Subject: [PATCH 073/450] ASoC: amd: acp: Adjust pdm gain value Set pdm gain value by setting PDM_MISC_CTRL_MASK value. To avoid low pdm gain value. Signed-off-by: Venkata Prasad Potturu Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20250821054606.1279178-1-venkataprasad.potturu@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/amd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/amd/acp/amd.h b/sound/soc/amd/acp/amd.h index cb8d97122f95..73a028e67246 100644 --- a/sound/soc/amd/acp/amd.h +++ b/sound/soc/amd/acp/amd.h @@ -130,7 +130,7 @@ #define PDM_DMA_INTR_MASK 0x10000 #define PDM_DEC_64 0x2 #define PDM_CLK_FREQ_MASK 0x07 -#define PDM_MISC_CTRL_MASK 0x10 +#define PDM_MISC_CTRL_MASK 0x18 #define PDM_ENABLE 0x01 #define PDM_DISABLE 0x00 #define DMA_EN_MASK 0x02 From 28edfaa10ca1b370b1a27fde632000d35c43402c Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Mon, 1 Sep 2025 16:15:07 +0100 Subject: [PATCH 074/450] ASoC: SDCA: Add quirk for incorrect function types for 3 systems Certain systems have CS42L43 DisCo that claims to conform to version 0.6.28 but uses the function types from the 1.0 spec. Add a quirk as a workaround. Closes: https://github.com/thesofproject/linux/issues/5515 Cc: stable@vger.kernel.org Signed-off-by: Maciej Strozek Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20250901151518.3197941-1-mstrozek@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca.h | 1 + sound/soc/sdca/sdca_device.c | 20 ++++++++++++++++++++ sound/soc/sdca/sdca_functions.c | 13 ++++++++----- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/sound/sdca.h b/include/sound/sdca.h index 5a5d6de78d72..9c6a351c9d47 100644 --- a/include/sound/sdca.h +++ b/include/sound/sdca.h @@ -46,6 +46,7 @@ struct sdca_device_data { enum sdca_quirk { SDCA_QUIRKS_RT712_VB, + SDCA_QUIRKS_SKIP_FUNC_TYPE_PATCHING, }; #if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SND_SOC_SDCA) diff --git a/sound/soc/sdca/sdca_device.c b/sound/soc/sdca/sdca_device.c index 0244cdcdd109..4798ce2c8f0b 100644 --- a/sound/soc/sdca/sdca_device.c +++ b/sound/soc/sdca/sdca_device.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -55,11 +56,30 @@ static bool sdca_device_quirk_rt712_vb(struct sdw_slave *slave) return false; } +static bool sdca_device_quirk_skip_func_type_patching(struct sdw_slave *slave) +{ + const char *vendor, *sku; + + vendor = dmi_get_system_info(DMI_SYS_VENDOR); + sku = dmi_get_system_info(DMI_PRODUCT_SKU); + + if (vendor && sku && + !strcmp(vendor, "Dell Inc.") && + (!strcmp(sku, "0C62") || !strcmp(sku, "0C63") || !strcmp(sku, "0C6B")) && + slave->sdca_data.interface_revision == 0x061c && + slave->id.mfg_id == 0x01fa && slave->id.part_id == 0x4243) + return true; + + return false; +} + bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_quirk quirk) { switch (quirk) { case SDCA_QUIRKS_RT712_VB: return sdca_device_quirk_rt712_vb(slave); + case SDCA_QUIRKS_SKIP_FUNC_TYPE_PATCHING: + return sdca_device_quirk_skip_func_type_patching(slave); default: break; } diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c index f26f597dca9e..13f68f7b6dd6 100644 --- a/sound/soc/sdca/sdca_functions.c +++ b/sound/soc/sdca/sdca_functions.c @@ -90,6 +90,7 @@ static int find_sdca_function(struct acpi_device *adev, void *data) { struct fwnode_handle *function_node = acpi_fwnode_handle(adev); struct sdca_device_data *sdca_data = data; + struct sdw_slave *slave = container_of(sdca_data, struct sdw_slave, sdca_data); struct device *dev = &adev->dev; struct fwnode_handle *control5; /* used to identify function type */ const char *function_name; @@ -137,11 +138,13 @@ static int find_sdca_function(struct acpi_device *adev, void *data) return ret; } - ret = patch_sdca_function_type(sdca_data->interface_revision, &function_type); - if (ret < 0) { - dev_err(dev, "SDCA version %#x invalid function type %d\n", - sdca_data->interface_revision, function_type); - return ret; + if (!sdca_device_quirk_match(slave, SDCA_QUIRKS_SKIP_FUNC_TYPE_PATCHING)) { + ret = patch_sdca_function_type(sdca_data->interface_revision, &function_type); + if (ret < 0) { + dev_err(dev, "SDCA version %#x invalid function type %d\n", + sdca_data->interface_revision, function_type); + return ret; + } } function_name = get_sdca_function_name(function_type); From 0c28431f6fe13f3a3be0978f79c1a7ae8a93d028 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Tue, 2 Sep 2025 13:21:00 +0300 Subject: [PATCH 075/450] ASoC: SOF: imx: Fix devm_ioremap_resource check devm_ioremap_resource does not return NULL on error but an error pointer so we need to use IS_ERR to check the return code. While at it also pass the error code to dev_err_probe to improve logging. Fixes: bc163baef570 ("ASoC: Use of_reserved_mem_region_to_resource() for "memory-region"") Signed-off-by: Daniel Baluta Link: https://patch.msgid.link/20250902102101.378809-1-daniel.baluta@nxp.com Signed-off-by: Mark Brown --- sound/soc/sof/imx/imx-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/sof/imx/imx-common.c b/sound/soc/sof/imx/imx-common.c index f00b381cec3b..d66c198b861a 100644 --- a/sound/soc/sof/imx/imx-common.c +++ b/sound/soc/sof/imx/imx-common.c @@ -316,9 +316,9 @@ static int imx_parse_ioremap_memory(struct snd_sof_dev *sdev) } sdev->bar[blk_type] = devm_ioremap_resource(sdev->dev, res); - if (!sdev->bar[blk_type]) + if (IS_ERR(sdev->bar[blk_type])) return dev_err_probe(sdev->dev, - -ENOMEM, + PTR_ERR(sdev->bar[blk_type]), "failed to ioremap %s region\n", chip_info->memory[i].name); } From 35fc531a59694f24a2456569cf7d1a9c6436841c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 2 Sep 2025 13:06:39 +0100 Subject: [PATCH 076/450] ASoC: SOF: Intel: hda-stream: Fix incorrect variable used in error message The dev_err message is reporting an error about capture streams however it is using the incorrect variable num_playback instead of num_capture. Fix this by using the correct variable num_capture. Fixes: a1d1e266b445 ("ASoC: SOF: Intel: Add Intel specific HDA stream operations") Signed-off-by: Colin Ian King Acked-by: Peter Ujfalusi Link: https://patch.msgid.link/20250902120639.2626861-1-colin.i.king@gmail.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sof/intel/hda-stream.c b/sound/soc/sof/intel/hda-stream.c index aa6b0247d5c9..a34f472ef175 100644 --- a/sound/soc/sof/intel/hda-stream.c +++ b/sound/soc/sof/intel/hda-stream.c @@ -890,7 +890,7 @@ int hda_dsp_stream_init(struct snd_sof_dev *sdev) if (num_capture >= SOF_HDA_CAPTURE_STREAMS) { dev_err(sdev->dev, "error: too many capture streams %d\n", - num_playback); + num_capture); return -EINVAL; } From 81ac63321eb936b1a1f7045b37674661f8ffb4a5 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 5 Aug 2025 10:36:29 +0800 Subject: [PATCH 077/450] trace: Remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250805023630.335719-1-rongqianfeng@vivo.com Signed-off-by: Qianfeng Rong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index af42aaa3d172..2ab283fd3032 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm, { struct user_event_enabler_fault *fault; - fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN); + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT); if (!fault) return false; From 3d62ab32df065e4a7797204a918f6489ddb8a237 Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Tue, 19 Aug 2025 10:51:52 +0000 Subject: [PATCH 078/450] tracing: Fix tracing_marker may trigger page fault during preempt_disable Both tracing_mark_write and tracing_mark_raw_write call __copy_from_user_inatomic during preempt_disable. But in some case, __copy_from_user_inatomic may trigger page fault, and will call schedule() subtly. And if a task is migrated to other cpu, the following warning will be trigger: if (RB_WARN_ON(cpu_buffer, !local_read(&cpu_buffer->committing))) An example can illustrate this issue: process flow CPU --------------------------------------------------------------------- tracing_mark_raw_write(): cpu:0 ... ring_buffer_lock_reserve(): cpu:0 ... cpu = raw_smp_processor_id() cpu:0 cpu_buffer = buffer->buffers[cpu] cpu:0 ... ... __copy_from_user_inatomic(): cpu:0 ... # page fault do_mem_abort(): cpu:0 ... # Call schedule schedule() cpu:0 ... # the task schedule to cpu1 __buffer_unlock_commit(): cpu:1 ... ring_buffer_unlock_commit(): cpu:1 ... cpu = raw_smp_processor_id() cpu:1 cpu_buffer = buffer->buffers[cpu] cpu:1 As shown above, the process will acquire cpuid twice and the return values are not the same. To fix this problem using copy_from_user_nofault instead of __copy_from_user_inatomic, as the former performs 'access_ok' before copying. Link: https://lore.kernel.org/20250819105152.2766363-1-luogengkun@huaweicloud.com Fixes: 656c7f0d2d2b ("tracing: Replace kmap with copy_from_user() in trace_marker writing") Signed-off-by: Luo Gengkun Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b7db732c0b1..2f1ae6c0ee81 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7209,7 +7209,7 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user entry = ring_buffer_event_data(event); entry->ip = ip; - len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + len = copy_from_user_nofault(&entry->buf, ubuf, cnt); if (len) { memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; @@ -7306,7 +7306,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, entry = ring_buffer_event_data(event); - len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + len = copy_from_user_nofault(&entry->id, ubuf, cnt); if (len) { entry->id = -1; memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); From de134cb54c3a67644ff95b1c9bffe545e752c912 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 20 Aug 2025 14:52:05 -0700 Subject: [PATCH 079/450] btrfs: fix squota compressed stats leak The following workload on a squota enabled fs: btrfs subvol create mnt/subvol # ensure subvol extents get accounted sync btrfs qgroup create 1/1 mnt btrfs qgroup assign mnt/subvol 1/1 mnt btrfs qgroup delete mnt/subvol # make the cleaner thread run btrfs filesystem sync mnt sleep 1 btrfs filesystem sync mnt btrfs qgroup destroy 1/1 mnt will fail with EBUSY. The reason is that 1/1 does the quick accounting when we assign subvol to it, gaining its exclusive usage as excl and excl_cmpr. But then when we delete subvol, the decrement happens via record_squota_delta() which does not update excl_cmpr, as squotas does not make any distinction between compressed and normal extents. Thus, we increment excl_cmpr but never decrement it, and are unable to delete 1/1. The two possible fixes are to make squota always mirror excl and excl_cmpr or to make the fast accounting separately track the plain and cmpr numbers. The latter felt cleaner to me so that is what I opted for. Fixes: 1e0e9d5771c3 ("btrfs: add helper for recording simple quota deltas") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ccaa9a3cf1ce..da102da169fd 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1455,6 +1455,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *qgroup; LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; + u64 num_bytes_cmpr = src->excl_cmpr; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -1466,11 +1467,12 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup_list *glist; qgroup->rfer += sign * num_bytes; - qgroup->rfer_cmpr += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes_cmpr; WARN_ON(sign < 0 && qgroup->excl < num_bytes); + WARN_ON(sign < 0 && qgroup->excl_cmpr < num_bytes_cmpr); qgroup->excl += sign * num_bytes; - qgroup->excl_cmpr += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes_cmpr; if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); From 6db1df415d73fcad12134a54f97dc6c8a64ab181 Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Mon, 25 Aug 2025 18:32:04 +0930 Subject: [PATCH 080/450] btrfs: accept and ignore compression level for lzo The compression level is meaningless for lzo, but before commit 3f093ccb95f30 ("btrfs: harden parsing of compression mount options"), it was silently ignored if passed. After that commit, passing a level with lzo fails to mount: BTRFS error: unrecognized compression value lzo:1 It seems reasonable for users to expect that lzo would permit a numeric level option, as all the other algos do, even though the kernel's implementation of LZO currently only supports a single level. Because it has always worked to pass a level, it seems likely to me that users in the real world are relying on doing so. This patch restores the old behavior, giving "lzo:N" the same semantics as all of the other compression algos. To be clear, silly variants like "lzo:one", "lzo:the_first_option", or "lzo:armageddon" also used to work. This isn't meant to suggest that any possible mis-interpretation of mount options that once worked must continue to work forever. This is an exceptional case where it makes sense to preserve compatibility, both because the mis-interpretation is reasonable, and because nothing tangible is sacrificed. Finally update btrfs_show_options() to ignore the level of LZO, as it is only the default level without any extra meaning. Fixes: 3f093ccb95f30 ("btrfs: harden parsing of compression mount options") Reviewed-by: Daniel Vacek Reviewed-by: Qu Wenruo Signed-off-by: Calvin Owens Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7f31f8bd63ba..e708faf1892f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -299,9 +299,12 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); - } else if (btrfs_match_compress_type(string, "lzo", false)) { + } else if (btrfs_match_compress_type(string, "lzo", true)) { ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = 0; + ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, + string + 3); + if (string[3] == ':' && string[4]) + btrfs_warn(NULL, "Compression level ignored for LZO"); btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); @@ -1079,7 +1082,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); - if (info->compress_level) + if (info->compress_level && info->compress_type != BTRFS_COMPRESS_LZO) seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) From 9786531399a679fc2f4630d2c0a186205282ab2f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 22 Aug 2025 16:06:13 +0930 Subject: [PATCH 081/450] btrfs: fix corruption reading compressed range when block size is smaller than page size [BUG] With 64K page size (aarch64 with 64K page size config) and 4K btrfs block size, the following workload can easily lead to a corrupted read: mkfs.btrfs -f -s 4k $dev > /dev/null mount -o compress $dev $mnt xfs_io -f -c "pwrite -S 0xff 0 64k" $mnt/base > /dev/null echo "correct result:" od -Ad -t x1 $mnt/base xfs_io -f -c "reflink $mnt/base 32k 0 32k" \ -c "reflink $mnt/base 0 32k 32k" \ -c "pwrite -S 0xff 60k 4k" $mnt/new > /dev/null echo "incorrect result:" od -Ad -t x1 $mnt/new umount $mnt This shows the following result: correct result: 0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0065536 incorrect result: 0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0032768 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0061440 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0065536 Notice the zero in the range [32K, 60K), which is incorrect. [CAUSE] With extra trace printk, it shows the following events during od: (some unrelated info removed like CPU and context) od-3457 btrfs_do_readpage: enter r/i=5/258 folio=0(65536) prev_em_start=0000000000000000 The "r/i" is indicating the root and inode number. In our case the file "new" is using ino 258 from fs tree (root 5). Here notice the @prev_em_start pointer is NULL. This means the btrfs_do_readpage() is called from btrfs_read_folio(), not from btrfs_readahead(). od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=0 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=4096 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=8192 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=12288 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=16384 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=20480 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=24576 got em start=0 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=28672 got em start=0 len=32768 These above 32K blocks will be read from the first half of the compressed data extent. od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=32768 got em start=32768 len=32768 Note here there is no btrfs_submit_compressed_read() call. Which is incorrect now. Although both extent maps at 0 and 32K are pointing to the same compressed data, their offsets are different thus can not be merged into the same read. So this means the compressed data read merge check is doing something wrong. od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=36864 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=40960 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=45056 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=49152 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=53248 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=57344 got em start=32768 len=32768 od-3457 btrfs_do_readpage: r/i=5/258 folio=0(65536) cur=61440 skip uptodate od-3457 btrfs_submit_compressed_read: cb orig_bio: file off=0 len=61440 The function btrfs_submit_compressed_read() is only called at the end of folio read. The compressed bio will only have an extent map of range [0, 32K), but the original bio passed in is for the whole 64K folio. This will cause the decompression part to only fill the first 32K, leaving the rest untouched (aka, filled with zero). This incorrect compressed read merge leads to the above data corruption. There were similar problems that happened in the past, commit 808f80b46790 ("Btrfs: update fix for read corruption of compressed and shared extents") is doing pretty much the same fix for readahead. But that's back to 2015, where btrfs still only supports bs (block size) == ps (page size) cases. This means btrfs_do_readpage() only needs to handle a folio which contains exactly one block. Only btrfs_readahead() can lead to a read covering multiple blocks. Thus only btrfs_readahead() passes a non-NULL @prev_em_start pointer. With v5.15 kernel btrfs introduced bs < ps support. This breaks the above assumption that a folio can only contain one block. Now btrfs_read_folio() can also read multiple blocks in one go. But btrfs_read_folio() doesn't pass a @prev_em_start pointer, thus the existing bio force submission check will never be triggered. In theory, this can also happen for btrfs with large folios, but since large folio is still experimental, we don't need to bother it, thus only bs < ps support is affected for now. [FIX] Instead of passing @prev_em_start to do the proper compressed extent check, introduce one new member, btrfs_bio_ctrl::last_em_start, so that the existing bio force submission logic will always be triggered. CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c953297aa89a..b21cb72835cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -111,6 +111,24 @@ struct btrfs_bio_ctrl { */ unsigned long submit_bitmap; struct readahead_control *ractl; + + /* + * The start offset of the last used extent map by a read operation. + * + * This is for proper compressed read merge. + * U64_MAX means we are starting the read and have made no progress yet. + * + * The current btrfs_bio_is_contig() only uses disk_bytenr as + * the condition to check if the read can be merged with previous + * bio, which is not correct. E.g. two file extents pointing to the + * same extent but with different offset. + * + * So here we need to do extra checks to only merge reads that are + * covered by the same extent map. + * Just extent_map::start will be enough, as they are unique + * inside the same inode. + */ + u64 last_em_start; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -909,7 +927,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, * return 0 on success, otherwise return error */ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) + struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -1019,12 +1037,11 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, * non-optimal behavior (submitting 2 bios for the same extent). */ if (compress_type != BTRFS_COMPRESS_NONE && - prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->start) + bio_ctrl->last_em_start != U64_MAX && + bio_ctrl->last_em_start != em->start) force_bio_submit = true; - if (prev_em_start) - *prev_em_start = em->start; + bio_ctrl->last_em_start = em->start; btrfs_free_extent_map(em); em = NULL; @@ -1238,12 +1255,15 @@ int btrfs_read_folio(struct file *file, struct folio *folio) const u64 start = folio_pos(folio); const u64 end = start + folio_size(folio) - 1; struct extent_state *cached_state = NULL; - struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ, + .last_em_start = U64_MAX, + }; struct extent_map *em_cached = NULL; int ret; lock_extents_for_read(inode, start, end, &cached_state); - ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); btrfs_free_extent_map(em_cached); @@ -2583,7 +2603,8 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD, - .ractl = rac + .ractl = rac, + .last_em_start = U64_MAX, }; struct folio *folio; struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); @@ -2591,12 +2612,11 @@ void btrfs_readahead(struct readahead_control *rac) const u64 end = start + readahead_length(rac) - 1; struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; - u64 prev_em_start = (u64)-1; lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) - btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + btrfs_do_readpage(folio, &em_cached, &bio_ctrl); btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); From f6a6c280059c4ddc23e12e3de1b01098e240036f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 26 Aug 2025 11:24:38 -0700 Subject: [PATCH 082/450] btrfs: fix subvolume deletion lockup caused by inodes xarray race There is a race condition between inode eviction and inode caching that can cause a live struct btrfs_inode to be missing from the root->inodes xarray. Specifically, there is a window during evict() between the inode being unhashed and deleted from the xarray. If btrfs_iget() is called for the same inode in that window, it will be recreated and inserted into the xarray, but then eviction will delete the new entry, leaving nothing in the xarray: Thread 1 Thread 2 --------------------------------------------------------------- evict() remove_inode_hash() btrfs_iget_path() btrfs_iget_locked() btrfs_read_locked_inode() btrfs_add_inode_to_root() destroy_inode() btrfs_destroy_inode() btrfs_del_inode_from_root() __xa_erase In turn, this can cause issues for subvolume deletion. Specifically, if an inode is in this lost state, and all other inodes are evicted, then btrfs_del_inode_from_root() will call btrfs_add_dead_root() prematurely. If the lost inode has a delayed_node attached to it, then when btrfs_clean_one_deleted_snapshot() calls btrfs_kill_all_delayed_nodes(), it will loop forever because the delayed_nodes xarray will never become empty (unless memory pressure forces the inode out). We saw this manifest as soft lockups in production. Fix it by only deleting the xarray entry if it matches the given inode (using __xa_cmpxchg()). Fixes: 310b2f5d5a94 ("btrfs: use an xarray to track open inodes in a root") Cc: stable@vger.kernel.org # 6.11+ Reviewed-by: Josef Bacik Reviewed-by: Filipe Manana Co-authored-by: Leo Martins Signed-off-by: Leo Martins Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd82dcc7b2b7..e7218e78bff4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5696,7 +5696,17 @@ static void btrfs_del_inode_from_root(struct btrfs_inode *inode) bool empty = false; xa_lock(&root->inodes); - entry = __xa_erase(&root->inodes, btrfs_ino(inode)); + /* + * This btrfs_inode is being freed and has already been unhashed at this + * point. It's possible that another btrfs_inode has already been + * allocated for the same inode and inserted itself into the root, so + * don't delete it in that case. + * + * Note that this shouldn't need to allocate memory, so the gfp flags + * don't really matter. + */ + entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL, + GFP_ATOMIC); if (entry == inode) empty = xa_empty(&root->inodes); xa_unlock(&root->inodes); From ad64c073c9a031850de1542e6e976b0249e7e650 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Wed, 3 Sep 2025 12:08:41 +0200 Subject: [PATCH 083/450] ALSA: docs: Remove 3rd person singular s in *to indicate* Fixes: 78811dd56def ("ALSA: docs: Add documents for recently changes in snd-usb-audio") Signed-off-by: Paul Menzel Link: https://patch.msgid.link/20250903100842.267194-1-pmenzel@molgen.mpg.de Signed-off-by: Takashi Iwai --- Documentation/sound/alsa-configuration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/sound/alsa-configuration.rst b/Documentation/sound/alsa-configuration.rst index 062b86522e4d..accaebbdd642 100644 --- a/Documentation/sound/alsa-configuration.rst +++ b/Documentation/sound/alsa-configuration.rst @@ -2293,7 +2293,7 @@ delayed_register notice the need. skip_validation Skip unit descriptor validation (default: no). - The option is used to ignores the validation errors with the hexdump + The option is used to ignore the validation errors with the hexdump of the unit descriptor instead of a driver probe error, so that we can check its details. quirk_flags From ba3319e5905710abe495b11a1aaf03ebb51d62e2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Tue, 26 Aug 2025 00:27:47 -0500 Subject: [PATCH 084/450] cpufreq/amd-pstate: Fix a regression leading to EPP 0 after resume During the suspend sequence the cached CPPC request is destroyed with the expectation that it's restored during resume. This assumption broke when the separate cache EPP variable was removed, and then it was broken again by commit 608a76b65288 ("cpufreq/amd-pstate: Add support for the "Requested CPU Min frequency" BIOS option") which explicitly set it to zero during suspend. Remove the invalidation and set the value during the suspend call to update limits so that the cached variable can be used to restore on resume. Fixes: 608a76b65288 ("cpufreq/amd-pstate: Add support for the "Requested CPU Min frequency" BIOS option") Fixes: b7a41156588a ("cpufreq/amd-pstate: Invalidate cppc_req_cached during suspend") Reported-by: goldens Closes: https://community.frame.work/t/increased-power-usage-after-resuming-from-suspend-on-ryzen-7040-kernel-6-15-regression/ Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2391221 Tested-by: goldens Tested-by: Willian Wang Reported-by: Vincent Mauirn Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219981 Tested-by: Alex De Lorenzo Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20250826052747.2240670-1-superm1@kernel.org Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5cd91489fcbe..b4c79fde1979 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1628,13 +1628,14 @@ static int amd_pstate_suspend(struct cpufreq_policy *policy) * min_perf value across kexec reboots. If this CPU is just resumed back without kexec, * the limits, epp and desired perf will get reset to the cached values in cpudata struct */ - ret = amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + ret = amd_pstate_update_perf(policy, perf.bios_min_perf, + FIELD_GET(AMD_CPPC_DES_PERF_MASK, cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached), + false); if (ret) return ret; - /* invalidate to ensure it's rewritten during resume */ - cpudata->cppc_req_cached = 0; - /* set this flag to avoid setting core offline*/ cpudata->suspended = true; From 3254959b4dd065eae396cf78ccc1361460b2f53e Mon Sep 17 00:00:00 2001 From: Syed Saba Kareem Date: Wed, 3 Sep 2025 22:47:47 +0530 Subject: [PATCH 085/450] ASoC: amd: amd_sdw: Add quirks for some new Dell laptops Add a quirk to include the codec amplifier function for Dell SKU's listed in quirk table. Note: In these SKU's, the RT722 codec amplifier is excluded, and an external amplifier is used instead. Signed-off-by: Syed Saba Kareem Message-ID: <20250903171817.2549507-1-syed.sabakareem@amd.com> Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-sdw-legacy-mach.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sound/soc/amd/acp/acp-sdw-legacy-mach.c b/sound/soc/amd/acp/acp-sdw-legacy-mach.c index c2197b75a7dd..5a3cfedacbaf 100644 --- a/sound/soc/amd/acp/acp-sdw-legacy-mach.c +++ b/sound/soc/amd/acp/acp-sdw-legacy-mach.c @@ -79,6 +79,22 @@ static const struct dmi_system_id soc_sdw_quirk_table[] = { }, .driver_data = (void *)(ASOC_SDW_CODEC_SPKR), }, + { + .callback = soc_sdw_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"), + DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "0DD3"), + }, + .driver_data = (void *)(ASOC_SDW_CODEC_SPKR), + }, + { + .callback = soc_sdw_quirk_cb, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"), + DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "0DD4"), + }, + .driver_data = (void *)(ASOC_SDW_CODEC_SPKR), + }, {} }; From d5067034725b1a0b2c785cea9cfce68776a94042 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Mon, 1 Sep 2025 10:31:08 +0200 Subject: [PATCH 086/450] Revert "drm/nouveau: Remove waitque for sched teardown" This reverts: commit bead88002227 ("drm/nouveau: Remove waitque for sched teardown") commit 5f46f5c7af8c ("drm/nouveau: Add new callback for scheduler teardown") from the drm/sched teardown leak fix series: https://lore.kernel.org/dri-devel/20250710125412.128476-2-phasta@kernel.org/ The aforementioned series removed a blocking waitqueue from nouveau_sched_fini(). It was mistakenly assumed that this waitqueue only prevents jobs from leaking, which the series fixed. The waitqueue, however, also guarantees that all VM_BIND related jobs are finished in order, cleaning up mappings in the GPU's MMU. These jobs must be executed sequentially. Without the waitqueue, this is no longer guaranteed, because entity and scheduler teardown can race with each other. Revert all patches related to the waitqueue removal. Fixes: bead88002227 ("drm/nouveau: Remove waitque for sched teardown") Suggested-by: Danilo Krummrich Signed-off-by: Philipp Stanner Link: https://lore.kernel.org/r/20250901083107.10206-2-phasta@kernel.org Signed-off-by: Danilo Krummrich --- drivers/gpu/drm/nouveau/nouveau_fence.c | 15 ----------- drivers/gpu/drm/nouveau/nouveau_fence.h | 1 - drivers/gpu/drm/nouveau/nouveau_sched.c | 35 ++++++++++--------------- drivers/gpu/drm/nouveau/nouveau_sched.h | 9 ++++--- drivers/gpu/drm/nouveau/nouveau_uvmm.c | 8 +++--- 5 files changed, 24 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c index 9f345a008717..869d4335c0f4 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.c +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c @@ -240,21 +240,6 @@ nouveau_fence_emit(struct nouveau_fence *fence) return ret; } -void -nouveau_fence_cancel(struct nouveau_fence *fence) -{ - struct nouveau_fence_chan *fctx = nouveau_fctx(fence); - unsigned long flags; - - spin_lock_irqsave(&fctx->lock, flags); - if (!dma_fence_is_signaled_locked(&fence->base)) { - dma_fence_set_error(&fence->base, -ECANCELED); - if (nouveau_fence_signal(fence)) - nvif_event_block(&fctx->event); - } - spin_unlock_irqrestore(&fctx->lock, flags); -} - bool nouveau_fence_done(struct nouveau_fence *fence) { diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h index 9957a919bd38..183dd43ecfff 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.h +++ b/drivers/gpu/drm/nouveau/nouveau_fence.h @@ -29,7 +29,6 @@ void nouveau_fence_unref(struct nouveau_fence **); int nouveau_fence_emit(struct nouveau_fence *); bool nouveau_fence_done(struct nouveau_fence *); -void nouveau_fence_cancel(struct nouveau_fence *fence); int nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr); int nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive, bool intr); diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c index 0cc0bc9f9952..e60f7892f5ce 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sched.c +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c @@ -11,7 +11,6 @@ #include "nouveau_exec.h" #include "nouveau_abi16.h" #include "nouveau_sched.h" -#include "nouveau_chan.h" #define NOUVEAU_SCHED_JOB_TIMEOUT_MS 10000 @@ -122,9 +121,11 @@ nouveau_job_done(struct nouveau_job *job) { struct nouveau_sched *sched = job->sched; - spin_lock(&sched->job_list.lock); + spin_lock(&sched->job.list.lock); list_del(&job->entry); - spin_unlock(&sched->job_list.lock); + spin_unlock(&sched->job.list.lock); + + wake_up(&sched->job.wq); } void @@ -305,9 +306,9 @@ nouveau_job_submit(struct nouveau_job *job) } /* Submit was successful; add the job to the schedulers job list. */ - spin_lock(&sched->job_list.lock); - list_add(&job->entry, &sched->job_list.head); - spin_unlock(&sched->job_list.lock); + spin_lock(&sched->job.list.lock); + list_add(&job->entry, &sched->job.list.head); + spin_unlock(&sched->job.list.lock); drm_sched_job_arm(&job->base); job->done_fence = dma_fence_get(&job->base.s_fence->finished); @@ -392,23 +393,10 @@ nouveau_sched_free_job(struct drm_sched_job *sched_job) nouveau_job_fini(job); } -static void -nouveau_sched_cancel_job(struct drm_sched_job *sched_job) -{ - struct nouveau_fence *fence; - struct nouveau_job *job; - - job = to_nouveau_job(sched_job); - fence = to_nouveau_fence(job->done_fence); - - nouveau_fence_cancel(fence); -} - static const struct drm_sched_backend_ops nouveau_sched_ops = { .run_job = nouveau_sched_run_job, .timedout_job = nouveau_sched_timedout_job, .free_job = nouveau_sched_free_job, - .cancel_job = nouveau_sched_cancel_job, }; static int @@ -458,8 +446,9 @@ nouveau_sched_init(struct nouveau_sched *sched, struct nouveau_drm *drm, goto fail_sched; mutex_init(&sched->mutex); - spin_lock_init(&sched->job_list.lock); - INIT_LIST_HEAD(&sched->job_list.head); + spin_lock_init(&sched->job.list.lock); + INIT_LIST_HEAD(&sched->job.list.head); + init_waitqueue_head(&sched->job.wq); return 0; @@ -493,12 +482,16 @@ nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm, return 0; } + static void nouveau_sched_fini(struct nouveau_sched *sched) { struct drm_gpu_scheduler *drm_sched = &sched->base; struct drm_sched_entity *entity = &sched->entity; + rmb(); /* for list_empty to work without lock */ + wait_event(sched->job.wq, list_empty(&sched->job.list.head)); + drm_sched_entity_fini(entity); drm_sched_fini(drm_sched); diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h b/drivers/gpu/drm/nouveau/nouveau_sched.h index b98c3f0bef30..20cd1da8db73 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sched.h +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h @@ -103,9 +103,12 @@ struct nouveau_sched { struct mutex mutex; struct { - struct list_head head; - spinlock_t lock; - } job_list; + struct { + struct list_head head; + spinlock_t lock; + } list; + struct wait_queue_head wq; + } job; }; int nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm, diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c index ddfc46bc1b3e..48f105239f42 100644 --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c @@ -1019,8 +1019,8 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range) u64 end = addr + range; again: - spin_lock(&sched->job_list.lock); - list_for_each_entry(__job, &sched->job_list.head, entry) { + spin_lock(&sched->job.list.lock); + list_for_each_entry(__job, &sched->job.list.head, entry) { struct nouveau_uvmm_bind_job *bind_job = to_uvmm_bind_job(__job); list_for_each_op(op, &bind_job->ops) { @@ -1030,7 +1030,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range) if (!(end <= op_addr || addr >= op_end)) { nouveau_uvmm_bind_job_get(bind_job); - spin_unlock(&sched->job_list.lock); + spin_unlock(&sched->job.list.lock); wait_for_completion(&bind_job->complete); nouveau_uvmm_bind_job_put(bind_job); goto again; @@ -1038,7 +1038,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range) } } } - spin_unlock(&sched->job_list.lock); + spin_unlock(&sched->job.list.lock); } static int From 394bfac1c7f7b701c2c93834c5761b9c9ceeebcf Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 22 Aug 2025 06:33:18 +0000 Subject: [PATCH 087/450] mm/khugepaged: fix the address passed to notifier on testing young Commit 8ee53820edfd ("thp: mmu_notifier_test_young") introduced mmu_notifier_test_young(), but we are passing the wrong address. In xxx_scan_pmd(), the actual iteration address is "_address" not "address". We seem to misuse the variable on the very beginning. Change it to the right one. [akpm@linux-foundation.org fix whitespace, per everyone] Link: https://lkml.kernel.org/r/20250822063318.11644-1-richard.weiyang@gmail.com Fixes: 8ee53820edfd ("thp: mmu_notifier_test_young") Signed-off-by: Wei Yang Reviewed-by: Dev Jain Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Liam R. Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Barry Song Cc: Signed-off-by: Andrew Morton --- mm/khugepaged.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b40bdfd224c..b486c1d19b2d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1417,8 +1417,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || - folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, - address))) + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, _address))) referenced++; } if (!writable) { From 397f6d14f9c370e4910e6885294c340f39dedbf5 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Fri, 27 Jun 2025 20:57:47 +0800 Subject: [PATCH 088/450] mm/memory_hotplug: fix hwpoisoned large folio handling in do_migrate_range() In do_migrate_range(), the hwpoisoned folio may be large folio, which can't be handled by unmap_poisoned_folio(). I can reproduce this issue in qemu after adding delay in memory_failure() BUG: kernel NULL pointer dereference, address: 0000000000000000 Workqueue: kacpi_hotplug acpi_hotplug_work_fn RIP: 0010:try_to_unmap_one+0x16a/0xfc0 rmap_walk_anon+0xda/0x1f0 try_to_unmap+0x78/0x80 ? __pfx_try_to_unmap_one+0x10/0x10 ? __pfx_folio_not_mapped+0x10/0x10 ? __pfx_folio_lock_anon_vma_read+0x10/0x10 unmap_poisoned_folio+0x60/0x140 do_migrate_range+0x4d1/0x600 ? slab_memory_callback+0x6a/0x190 ? notifier_call_chain+0x56/0xb0 offline_pages+0x3e6/0x460 memory_subsys_offline+0x130/0x1f0 device_offline+0xba/0x110 acpi_bus_offline+0xb7/0x130 acpi_scan_hot_remove+0x77/0x290 acpi_device_hotplug+0x1e0/0x240 acpi_hotplug_work_fn+0x1a/0x30 process_one_work+0x186/0x340 Besides, do_migrate_range() may be called between memory_failure set hwpoison flag and isolate the folio from lru, so remove WARN_ON(). In other places, unmap_poisoned_folio() is called when the folio is isolated, obey it in do_migrate_range() too. [david@redhat.com: don't abort offlining, fixed typo, add comment] Link: https://lkml.kernel.org/r/3c214dff-9649-4015-840f-10de0e03ebe4@redhat.com Fixes: b15c87263a69 ("hwpoison, memory_hotplug: allow hwpoisoned pages to be offlined") Signed-off-by: Jinjiang Tu Signed-off-by: David Hildenbrand Acked-by: Zi Yan Reviewed-by: Miaohe Lin Cc: Kefeng Wang Cc: Luis Chamberalin Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f15af712bc3..74318c787715 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1815,8 +1815,14 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; if (folio_contain_hwpoisoned_page(folio)) { - if (WARN_ON(folio_test_lru(folio))) - folio_isolate_lru(folio); + /* + * unmap_poisoned_folio() cannot handle large folios + * in all cases yet. + */ + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + goto put_folio; + if (folio_test_lru(folio) && !folio_isolate_lru(folio)) + goto put_folio; if (folio_mapped(folio)) { folio_lock(folio); unmap_poisoned_folio(folio, pfn, false); From 669602b5b7386e4fa00fc67b045ca3fd816e685d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 24 Aug 2025 16:07:59 +0300 Subject: [PATCH 089/450] init/main.c: fix boot time tracing crash Steven Rostedt reported a crash with "ftrace=function" kernel command line: [ 0.159269] BUG: kernel NULL pointer dereference, address: 000000000000001c [ 0.160254] #PF: supervisor read access in kernel mode [ 0.160975] #PF: error_code(0x0000) - not-present page [ 0.161697] PGD 0 P4D 0 [ 0.162055] Oops: Oops: 0000 [#1] SMP PTI [ 0.162619] CPU: 0 UID: 0 PID: 0 Comm: swapper Not tainted 6.17.0-rc2-test-00006-g48d06e78b7cb-dirty #9 PREEMPT(undef) [ 0.164141] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 0.165439] RIP: 0010:kmem_cache_alloc_noprof (mm/slub.c:4237) [ 0.166186] Code: 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 83 e4 f0 48 83 ec 20 8b 05 c9 b6 7e 01 <44> 8b 77 1c 65 4c 8b 2d b5 ea 20 02 4c 89 6c 24 18 41 89 f5 21 f0 [ 0.168811] RSP: 0000:ffffffffb2e03b30 EFLAGS: 00010086 [ 0.169545] RAX: 0000000001fff33f RBX: 0000000000000000 RCX: 0000000000000000 [ 0.170544] RDX: 0000000000002800 RSI: 0000000000002800 RDI: 0000000000000000 [ 0.171554] RBP: ffffffffb2e03b80 R08: 0000000000000004 R09: ffffffffb2e03c90 [ 0.172549] R10: ffffffffb2e03c90 R11: 0000000000000000 R12: 0000000000000000 [ 0.173544] R13: ffffffffb2e03c90 R14: ffffffffb2e03c90 R15: 0000000000000001 [ 0.174542] FS: 0000000000000000(0000) GS:ffff9d2808114000(0000) knlGS:0000000000000000 [ 0.175684] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.176486] CR2: 000000000000001c CR3: 000000007264c001 CR4: 00000000000200b0 [ 0.177483] Call Trace: [ 0.177828] [ 0.178123] mas_alloc_nodes (lib/maple_tree.c:176 (discriminator 2) lib/maple_tree.c:1255 (discriminator 2)) [ 0.178692] mas_store_gfp (lib/maple_tree.c:5468) [ 0.179223] execmem_cache_add_locked (mm/execmem.c:207) [ 0.179870] execmem_alloc (mm/execmem.c:213 mm/execmem.c:313 mm/execmem.c:335 mm/execmem.c:475) [ 0.180397] ? ftrace_caller (arch/x86/kernel/ftrace_64.S:169) [ 0.180922] ? __pfx_ftrace_caller (arch/x86/kernel/ftrace_64.S:158) [ 0.181517] execmem_alloc_rw (mm/execmem.c:487) [ 0.182052] arch_ftrace_update_trampoline (arch/x86/kernel/ftrace.c:266 arch/x86/kernel/ftrace.c:344 arch/x86/kernel/ftrace.c:474) [ 0.182778] ? ftrace_caller_op_ptr (arch/x86/kernel/ftrace_64.S:182) [ 0.183388] ftrace_update_trampoline (kernel/trace/ftrace.c:7947) [ 0.184024] __register_ftrace_function (kernel/trace/ftrace.c:368) [ 0.184682] ftrace_startup (kernel/trace/ftrace.c:3048) [ 0.185205] ? __pfx_function_trace_call (kernel/trace/trace_functions.c:210) [ 0.185877] register_ftrace_function_nolock (kernel/trace/ftrace.c:8717) [ 0.186595] register_ftrace_function (kernel/trace/ftrace.c:8745) [ 0.187254] ? __pfx_function_trace_call (kernel/trace/trace_functions.c:210) [ 0.187924] function_trace_init (kernel/trace/trace_functions.c:170) [ 0.188499] tracing_set_tracer (kernel/trace/trace.c:5916 kernel/trace/trace.c:6349) [ 0.189088] register_tracer (kernel/trace/trace.c:2391) [ 0.189642] early_trace_init (kernel/trace/trace.c:11075 kernel/trace/trace.c:11149) [ 0.190204] start_kernel (init/main.c:970) [ 0.190732] x86_64_start_reservations (arch/x86/kernel/head64.c:307) [ 0.191381] x86_64_start_kernel (??:?) [ 0.191955] common_startup_64 (arch/x86/kernel/head_64.S:419) [ 0.192534] [ 0.192839] Modules linked in: [ 0.193267] CR2: 000000000000001c [ 0.193730] ---[ end trace 0000000000000000 ]--- The crash happens because on x86 ftrace allocations from execmem require maple tree to be initialized. Move maple tree initialization that depends only on slab availability earlier in boot so that it will happen right after mm_core_init(). Link: https://lkml.kernel.org/r/20250824130759.1732736-1-rppt@kernel.org Fixes: 5d79c2be5081 ("x86/ftrace: enable EXECMEM_ROX_CACHE for ftrace allocations") Signed-off-by: Mike Rapoport (Microsoft) Reported-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Closes: https://lore.kernel.org/all/20250820184743.0302a8b5@gandalf.local.home/ Reviewed-by: Masami Hiramatsu (Google) Reviewed-by: Liam R. Howlett Cc: Borislav Betkov Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index 0ee0ee7b7c2c..5753e9539ae6 100644 --- a/init/main.c +++ b/init/main.c @@ -956,6 +956,7 @@ void start_kernel(void) sort_main_extable(); trap_init(); mm_core_init(); + maple_tree_init(); poking_init(); ftrace_init(); @@ -973,7 +974,6 @@ void start_kernel(void) "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); radix_tree_init(); - maple_tree_init(); /* * Set up housekeeping before setting up workqueues to allow the unbound From 21cc2b5c5062a256ae9064442d37ebbc23f5aef7 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Sun, 24 Aug 2025 03:21:15 +0900 Subject: [PATCH 090/450] mm/hugetlb: add missing hugetlb_lock in __unmap_hugepage_range() When restoring a reservation for an anonymous page, we need to check to freeing a surplus. However, __unmap_hugepage_range() causes data race because it reads h->surplus_huge_pages without the protection of hugetlb_lock. And adjust_reservation is a boolean variable that indicates whether reservations for anonymous pages in each folio should be restored. Therefore, it should be initialized to false for each round of the loop. However, this variable is not initialized to false except when defining the current adjust_reservation variable. This means that once adjust_reservation is set to true even once within the loop, reservations for anonymous pages will be restored unconditionally in all subsequent rounds, regardless of the folio's state. To fix this, we need to add the missing hugetlb_lock, unlock the page_table_lock earlier so that we don't lock the hugetlb_lock inside the page_table_lock lock, and initialize adjust_reservation to false on each round within the loop. Link: https://lkml.kernel.org/r/20250823182115.1193563-1-aha310510@gmail.com Fixes: df7a6d1f6405 ("mm/hugetlb: restore the reservation if needed") Signed-off-by: Jeongjun Park Reported-by: syzbot+417aeb05fd190f3a6da9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=417aeb05fd190f3a6da9 Reviewed-by: Sidhartha Kumar Cc: Breno Leitao Cc: David Hildenbrand Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 753f99b4c718..eed59cfb5d21 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5851,7 +5851,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, spinlock_t *ptl; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - bool adjust_reservation = false; + bool adjust_reservation; unsigned long last_addr_mask; bool force_flush = false; @@ -5944,6 +5944,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(folio); + spin_unlock(ptl); /* * Restore the reservation for anonymous page, otherwise the @@ -5951,14 +5952,16 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * If there we are freeing a surplus, do not set the restore * reservation bit. */ + adjust_reservation = false; + + spin_lock_irq(&hugetlb_lock); if (!h->surplus_huge_pages && __vma_private_lock(vma) && folio_test_anon(folio)) { folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } - - spin_unlock(ptl); + spin_unlock_irq(&hugetlb_lock); /* * Adjust the reservation for the region that will have the From ce652aac9c90a96c6536681d17518efb1f660fb8 Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Fri, 22 Aug 2025 11:50:57 +0900 Subject: [PATCH 091/450] mm/damon/core: set quota->charged_from to jiffies at first charge window Kernel initializes the "jiffies" timer as 5 minutes below zero, as shown in include/linux/jiffies.h /* * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) And jiffies comparison help functions cast unsigned value to signed to cover wraparound #define time_after_eq(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((a) - (b)) >= 0)) When quota->charged_from is initialized to 0, time_after_eq() can incorrectly return FALSE even after reset_interval has elapsed. This occurs when (jiffies - reset_interval) produces a value with MSB=1, which is interpreted as negative in signed arithmetic. This issue primarily affects 32-bit systems because: On 64-bit systems: MSB=1 values occur after ~292 million years from boot (assuming HZ=1000), almost impossible. On 32-bit systems: MSB=1 values occur during the first 5 minutes after boot, and the second half of every jiffies wraparound cycle, starting from day 25 (assuming HZ=1000) When above unexpected FALSE return from time_after_eq() occurs, the charging window will not reset. The user impact depends on esz value at that time. If esz is 0, scheme ignores configured quotas and runs without any limits. If esz is not 0, scheme stops working once the quota is exhausted. It remains until the charging window finally resets. So, change quota->charged_from to jiffies at damos_adjust_quota() when it is considered as the first charge window. By this change, we can avoid unexpected FALSE return from time_after_eq() Link: https://lkml.kernel.org/r/20250822025057.1740854-1-ekffu200098@gmail.com Fixes: 2b8a248d5873 ("mm/damon/schemes: implement size quota for schemes application speed control") # 5.16 Signed-off-by: Sang-Heon Jeon Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 106ee8b0f2d5..c2e0b469fd43 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2111,6 +2111,10 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) if (!quota->ms && !quota->sz && list_empty("a->goals)) return; + /* First charge window */ + if (!quota->total_charged_sz && !quota->charged_from) + quota->charged_from = jiffies; + /* New charge window starts */ if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { From 711f19dfd783ffb37ca4324388b9c4cb87e71363 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Wed, 27 Aug 2025 19:58:57 +0800 Subject: [PATCH 092/450] mm/damon/lru_sort: avoid divide-by-zero in damon_lru_sort_apply_parameters() Patch series "mm/damon: avoid divide-by-zero in DAMON module's parameters application". DAMON's RECLAIM and LRU_SORT modules perform no validation on user-configured parameters during application, which may lead to division-by-zero errors. Avoid the divide-by-zero by adding validation checks when DAMON modules attempt to apply the parameters. This patch (of 2): During the calculation of 'hot_thres' and 'cold_thres', either 'sample_interval' or 'aggr_interval' is used as the divisor, which may lead to division-by-zero errors. Fix it by directly returning -EINVAL when such a case occurs. Additionally, since 'aggr_interval' is already required to be set no smaller than 'sample_interval' in damon_set_attrs(), only the case where 'sample_interval' is zero needs to be checked. Link: https://lkml.kernel.org/r/20250827115858.1186261-2-yanquanmin1@huawei.com Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Cc: [6.0+] Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 151a9de5ad8b..b5a5ed16a7a5 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -198,6 +198,11 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; + if (!damon_lru_sort_mon_attrs.sample_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); if (err) goto out; From e6b543ca9806d7bced863f43020e016ee996c057 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Wed, 27 Aug 2025 19:58:58 +0800 Subject: [PATCH 093/450] mm/damon/reclaim: avoid divide-by-zero in damon_reclaim_apply_parameters() When creating a new scheme of DAMON_RECLAIM, the calculation of 'min_age_region' uses 'aggr_interval' as the divisor, which may lead to division-by-zero errors. Fix it by directly returning -EINVAL when such a case occurs. Link: https://lkml.kernel.org/r/20250827115858.1186261-3-yanquanmin1@huawei.com Fixes: f5a79d7c0c87 ("mm/damon: introduce struct damos_access_pattern") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Cc: [6.1+] Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3c71b4596676..fb7c982a0018 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -194,6 +194,11 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; + if (!damon_reclaim_mon_attrs.aggr_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); if (err) goto out; From 04d3cd43700a2d0fe4bfb1012a8ec7f2e34a3507 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 27 Aug 2025 03:42:21 -0700 Subject: [PATCH 094/450] arm64: kexec: initialize kexec_buf struct in load_other_segments() Patch series "kexec: Fix invalid field access". The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. An initial fix was already landed for arm64[0], and this patchset fixes the problem on the remaining arm64 code and on riscv, as raised by Mark. Discussions about this problem could be found at[1][2]. This patch (of 3): The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-0-1df9882bb01a@debian.org Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-1-1df9882bb01a@debian.org Link: https://lore.kernel.org/all/20250826180742.f2471131255ec1c43683ea07@linux-foundation.org/ [0] Link: https://lore.kernel.org/all/oninomspajhxp4omtdapxnckxydbk2nzmrix7rggmpukpnzadw@c67o7njgdgm3/ [1] Link: https://lore.kernel.org/all/20250826-akpm-v1-1-3c831f0e3799@debian.org/ [2] Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly") Signed-off-by: Breno Leitao Acked-by: Baoquan He Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Coiby Xu Cc: Heiko Carstens Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/arm64/kernel/machine_kexec_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index af1ca875c52c..410060ebd86d 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -94,7 +94,7 @@ int load_other_segments(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline) { - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; void *dtb = NULL; unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; From 8afbd0045922b8146acf1a78ae818693e0468dbd Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 27 Aug 2025 03:42:22 -0700 Subject: [PATCH 095/450] riscv: kexec: initialize kexec_buf struct The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-2-1df9882bb01a@debian.org Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly") Signed-off-by: Breno Leitao Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Baoquan He Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Coiby Xu Cc: Heiko Carstens Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/riscv/kernel/kexec_elf.c | 4 ++-- arch/riscv/kernel/kexec_image.c | 2 +- arch/riscv/kernel/machine_kexec_file.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c index 56444c7bd34e..531d348db84d 100644 --- a/arch/riscv/kernel/kexec_elf.c +++ b/arch/riscv/kernel/kexec_elf.c @@ -28,7 +28,7 @@ static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, int i; int ret = 0; size_t size; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; const struct elf_phdr *phdr; kbuf.image = image; @@ -66,7 +66,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, { int i; int ret; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; const struct elf_phdr *phdr; unsigned long lowest_paddr = ULONG_MAX; unsigned long lowest_vaddr = ULONG_MAX; diff --git a/arch/riscv/kernel/kexec_image.c b/arch/riscv/kernel/kexec_image.c index 26a81774a78a..8f2eb900910b 100644 --- a/arch/riscv/kernel/kexec_image.c +++ b/arch/riscv/kernel/kexec_image.c @@ -41,7 +41,7 @@ static void *image_load(struct kimage *image, struct riscv_image_header *h; u64 flags; bool be_image, be_kernel; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; int ret; /* Check Image header */ diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c index e36104af2e24..b9eb41b0a975 100644 --- a/arch/riscv/kernel/machine_kexec_file.c +++ b/arch/riscv/kernel/machine_kexec_file.c @@ -261,7 +261,7 @@ int load_extra_segments(struct kimage *image, unsigned long kernel_start, int ret; void *fdt; unsigned long initrd_pbase = 0UL; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; char *modified_cmdline = NULL; kbuf.image = image; From e67f0bd05519012eaabaae68618ffc4ed30ab680 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 27 Aug 2025 03:42:23 -0700 Subject: [PATCH 096/450] s390: kexec: initialize kexec_buf struct The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-3-1df9882bb01a@debian.org Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly") Signed-off-by: Breno Leitao Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Baoquan He Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Coiby Xu Cc: Heiko Carstens Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/s390/kernel/kexec_elf.c | 2 +- arch/s390/kernel/kexec_image.c | 2 +- arch/s390/kernel/machine_kexec_file.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 4d364de43799..143e34a4eca5 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_elf(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; Elf_Addr entry; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index a32ce8bea745..9a439175723c 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_image(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; buf.image = image; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index c2bac14dd668..a36d7311c668 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -129,7 +129,7 @@ static int kexec_file_update_purgatory(struct kimage *image, static int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -152,7 +152,7 @@ static int kexec_file_add_purgatory(struct kimage *image, static int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -184,7 +184,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, { __u32 *lc_ipl_parmblock_ptr; unsigned int len, ncerts; - struct kexec_buf buf; + struct kexec_buf buf = {}; unsigned long addr; void *ptr, *end; int ret; From 3be306cccdccede13e3cefd0c14e430cc2b7c9c7 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Thu, 28 Aug 2025 13:38:20 -0500 Subject: [PATCH 097/450] mm/memory-failure: fix redundant updates for already poisoned pages Duplicate memory errors can be reported by multiple sources. Passing an already poisoned page to action_result() causes issues: * The amount of hardware corrupted memory is incorrectly updated. * Per NUMA node MF stats are incorrectly updated. * Redundant "already poisoned" messages are printed. Avoid those issues by: * Skipping hardware corrupted memory updates for already poisoned pages. * Skipping per NUMA node MF stats updates for already poisoned pages. * Dropping redundant "already poisoned" messages. Make MF_MSG_ALREADY_POISONED consistent with other action_page_types and make calls to action_result() consistent for already poisoned normal pages and huge pages. Link: https://lkml.kernel.org/r/aLCiHMy12Ck3ouwC@hpe.com Fixes: b8b9488d50b7 ("mm/memory-failure: improve memory failure action_result messages") Signed-off-by: Kyle Meyer Reviewed-by: Jiaqi Yan Acked-by: David Hildenbrand Reviewed-by: Jane Chu Acked-by: Miaohe Lin Cc: Borislav Betkov Cc: Kyle Meyer Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Luck, Tony" Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Russ Anderson Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fc30ca4804bf..10b3c281c2ae 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -956,7 +956,7 @@ static const char * const action_page_types[] = { [MF_MSG_BUDDY] = "free buddy page", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", - [MF_MSG_ALREADY_POISONED] = "already poisoned", + [MF_MSG_ALREADY_POISONED] = "already poisoned page", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1349,9 +1349,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - num_poisoned_pages_inc(pfn); - - update_per_node_mf_stats(pfn, result); + if (type != MF_MSG_ALREADY_POISONED) { + num_poisoned_pages_inc(pfn); + update_per_node_mf_stats(pfn, result); + } pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -2094,12 +2095,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb *hugetlb = 0; return 0; } else if (res == -EHWPOISON) { - pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { folio = page_folio(p); res = kill_accessing_process(current, folio_pfn(folio), flags); - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); } + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); return res; } else if (res == -EBUSY) { if (!(flags & MF_NO_RETRY)) { @@ -2285,7 +2285,6 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; if (TestSetPageHWPoison(p)) { - pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) res = kill_accessing_process(current, pfn, flags); From 47ddf62b43eced739b56422cd55e86b9b4bb7dac Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Wed, 3 Sep 2025 18:51:14 +0200 Subject: [PATCH 098/450] Input: xpad - add support for Flydigi Apex 5 Add Flydigi Apex 5 to the list of recognized controllers. Reported-by: Brandon Lin Reported-by: Sergey Belozyorcev Signed-off-by: Antheas Kapenekakis Link: https://lore.kernel.org/r/20250903165114.2987905-1-lkml@antheas.dev Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 4c94297e17e6..d72e89c25e50 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -422,6 +422,7 @@ static const struct xpad_device { { 0x3537, 0x1010, "GameSir G7 SE", 0, XTYPE_XBOXONE }, { 0x366c, 0x0005, "ByoWave Proteus Controller", MAP_SHARE_BUTTON, XTYPE_XBOXONE, FLAG_DELAY_INIT }, { 0x3767, 0x0101, "Fanatec Speedster 3 Forceshock Wheel", 0, XTYPE_XBOX }, + { 0x37d7, 0x2501, "Flydigi Apex 5", 0, XTYPE_XBOX360 }, { 0x413d, 0x2104, "Black Shark Green Ghost Gamepad", 0, XTYPE_XBOX360 }, { 0xffff, 0xffff, "Chinese-made Xbox Controller", 0, XTYPE_XBOX }, { 0x0000, 0x0000, "Generic X-Box pad", 0, XTYPE_UNKNOWN } @@ -578,6 +579,7 @@ static const struct usb_device_id xpad_table[] = { XPAD_XBOX360_VENDOR(0x3537), /* GameSir Controllers */ XPAD_XBOXONE_VENDOR(0x3537), /* GameSir Controllers */ XPAD_XBOXONE_VENDOR(0x366c), /* ByoWave controllers */ + XPAD_XBOX360_VENDOR(0x37d7), /* Flydigi Controllers */ XPAD_XBOX360_VENDOR(0x413d), /* Black Shark Green Ghost Controller */ { } }; From 68f27f7c7708183e7873c585ded2f1b057ac5b97 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 4 Sep 2025 12:18:50 +0200 Subject: [PATCH 099/450] ASoC: qcom: q6apm-lpass-dais: Fix NULL pointer dereference if source graph failed If earlier opening of source graph fails (e.g. ADSP rejects due to incorrect audioreach topology), the graph is closed and "dai_data->graph[dai->id]" is assigned NULL. Preparing the DAI for sink graph continues though and next call to q6apm_lpass_dai_prepare() receives dai_data->graph[dai->id]=NULL leading to NULL pointer exception: qcom-apm gprsvc:service:2:1: Error (1) Processing 0x01001002 cmd qcom-apm gprsvc:service:2:1: DSP returned error[1001002] 1 q6apm-lpass-dais 30000000.remoteproc:glink-edge:gpr:service@1:bedais: fail to start APM port 78 q6apm-lpass-dais 30000000.remoteproc:glink-edge:gpr:service@1:bedais: ASoC: error at snd_soc_pcm_dai_prepare on TX_CODEC_DMA_TX_3: -22 Unable to handle kernel NULL pointer dereference at virtual address 00000000000000a8 ... Call trace: q6apm_graph_media_format_pcm+0x48/0x120 (P) q6apm_lpass_dai_prepare+0x110/0x1b4 snd_soc_pcm_dai_prepare+0x74/0x108 __soc_pcm_prepare+0x44/0x160 dpcm_be_dai_prepare+0x124/0x1c0 Fixes: 30ad723b93ad ("ASoC: qdsp6: audioreach: add q6apm lpass dai support") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Reviewed-by: Srinivas Kandagatla Message-ID: <20250904101849.121503-2-krzysztof.kozlowski@linaro.org> Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/q6apm-lpass-dais.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c index a0d90462fd6a..20974f10406b 100644 --- a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c +++ b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c @@ -213,8 +213,10 @@ static int q6apm_lpass_dai_prepare(struct snd_pcm_substream *substream, struct s return 0; err: - q6apm_graph_close(dai_data->graph[dai->id]); - dai_data->graph[dai->id] = NULL; + if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { + q6apm_graph_close(dai_data->graph[dai->id]); + dai_data->graph[dai->id] = NULL; + } return rc; } From f81e63047600d023cbfda372b6de8f2821ff6839 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 20 Aug 2025 17:37:15 +0100 Subject: [PATCH 100/450] ASoC: SDCA: Fix return value in sdca_regmap_mbq_size() The MBQ size function returns an integer representing the size of a Control. Currently if the Control is not found the function will return false which makes little sense. Correct this typo to return -EINVAL. Fixes: e3f7caf74b79 ("ASoC: SDCA: Add generic regmap SDCA helpers") Signed-off-by: Charles Keepax Message-ID: <20250820163717.1095846-2-ckeepax@opensource.cirrus.com> Signed-off-by: Mark Brown --- sound/soc/sdca/sdca_regmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sdca/sdca_regmap.c b/sound/soc/sdca/sdca_regmap.c index 5cb3048ea8cf..72f893e00ff5 100644 --- a/sound/soc/sdca/sdca_regmap.c +++ b/sound/soc/sdca/sdca_regmap.c @@ -196,7 +196,7 @@ int sdca_regmap_mbq_size(struct sdca_function_data *function, unsigned int reg) control = function_find_control(function, reg); if (!control) - return false; + return -EINVAL; return clamp_val(control->nbits / BITS_PER_BYTE, sizeof(u8), sizeof(u32)); } From 16c912ec34edc4d7daecde58e40d480858830a12 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 20 Aug 2025 17:37:16 +0100 Subject: [PATCH 101/450] ASoC: SDCA: Fix return value in detected_mode_handler() The detected mode IRQ handler should return an irqreturn_t not a regular error code. Correct the return value in detected_mode_handler(). Fixes: b9ab3b618241 ("ASoC: SDCA: Add some initial IRQ handlers") Signed-off-by: Charles Keepax Message-ID: <20250820163717.1095846-3-ckeepax@opensource.cirrus.com> Signed-off-by: Mark Brown --- sound/soc/sdca/sdca_interrupts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sdca/sdca_interrupts.c b/sound/soc/sdca/sdca_interrupts.c index 8018773ee426..79bf3042f57d 100644 --- a/sound/soc/sdca/sdca_interrupts.c +++ b/sound/soc/sdca/sdca_interrupts.c @@ -155,7 +155,7 @@ static irqreturn_t detected_mode_handler(int irq, void *data) SDCA_CTL_SELECTED_MODE_NAME); if (!name) - return -ENOMEM; + return IRQ_NONE; kctl = snd_soc_component_get_kcontrol(component, name); if (!kctl) { From ec630c2c8ce215dd365b8c3644f004f645714a0f Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 20 Aug 2025 17:37:17 +0100 Subject: [PATCH 102/450] ASoC: SDCA: Reorder members of hide struct to remove holes Remove some padding holes in the sdca_entity_hide struct by reordering the members. Signed-off-by: Charles Keepax Message-ID: <20250820163717.1095846-4-ckeepax@opensource.cirrus.com> Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index 06ec126cdcc3..ea68856e4c8c 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -1063,27 +1063,30 @@ struct sdca_entity_ge { /** * struct sdca_entity_hide - information specific to HIDE Entities * @hid: HID device structure - * @hidtx_ids: HIDTx Report ID * @num_hidtx_ids: number of HIDTx Report ID - * @hidrx_ids: HIDRx Report ID * @num_hidrx_ids: number of HIDRx Report ID - * @hide_reside_function_num: indicating which Audio Function Numbers within this Device - * @max_delay: the maximum time in microseconds allowed for the Device to change the ownership from Device to Host - * @af_number_list: which Audio Function Numbers within this Device are sending/receiving the messages in this HIDE - * @hid_desc: HID descriptor for the HIDE Entity + * @hidtx_ids: HIDTx Report ID + * @hidrx_ids: HIDRx Report ID + * @af_number_list: which Audio Function Numbers within this Device are + * sending/receiving the messages in this HIDE + * @hide_reside_function_num: indicating which Audio Function Numbers + * within this Device + * @max_delay: the maximum time in microseconds allowed for the Device + * to change the ownership from Device to Host * @hid_report_desc: HID Report Descriptor for the HIDE Entity + * @hid_desc: HID descriptor for the HIDE Entity */ struct sdca_entity_hide { struct hid_device *hid; unsigned int *hidtx_ids; - int num_hidtx_ids; unsigned int *hidrx_ids; + int num_hidtx_ids; int num_hidrx_ids; + unsigned int af_number_list[SDCA_MAX_FUNCTION_COUNT]; unsigned int hide_reside_function_num; unsigned int max_delay; - unsigned int af_number_list[SDCA_MAX_FUNCTION_COUNT]; - struct hid_descriptor hid_desc; unsigned char *hid_report_desc; + struct hid_descriptor hid_desc; }; /** From c9ddc41cdd522f2db5d492eda3df8994d928be34 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Sun, 17 Aug 2025 19:20:22 -0500 Subject: [PATCH 103/450] Input: iqs7222 - avoid enabling unused interrupts If a proximity event node is defined so as to specify the wake-up properties of the touch surface, the proximity event interrupt is enabled unconditionally. This may result in unwanted interrupts. Solve this problem by enabling the interrupt only if the event is mapped to a key or switch code. Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/aKJxxgEWpNaNcUaW@nixie71 Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/misc/iqs7222.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/misc/iqs7222.c b/drivers/input/misc/iqs7222.c index 6fac31c0d99f..ff23219a582a 100644 --- a/drivers/input/misc/iqs7222.c +++ b/drivers/input/misc/iqs7222.c @@ -2427,6 +2427,9 @@ static int iqs7222_parse_chan(struct iqs7222_private *iqs7222, if (error) return error; + if (!iqs7222->kp_type[chan_index][i]) + continue; + if (!dev_desc->event_offset) continue; From 1939a9fcb80353dd8b111aa1e79c691afbde08b4 Mon Sep 17 00:00:00 2001 From: Christoffer Sandberg Date: Tue, 26 Aug 2025 16:26:06 +0200 Subject: [PATCH 104/450] Input: i8042 - add TUXEDO InfinityBook Pro Gen10 AMD to i8042 quirk table Occasionally wakes up from suspend with missing input on the internal keyboard. Setting the quirks appears to fix the issue for this device as well. Signed-off-by: Christoffer Sandberg Signed-off-by: Werner Sembach Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250826142646.13516-1-wse@tuxedocomputers.com Signed-off-by: Dmitry Torokhov --- drivers/input/serio/i8042-acpipnpio.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index 6ed9fc34948c..1caa6c4ca435 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -1155,6 +1155,20 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "XxHP4NAx"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | + SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "XxKK4NAx_XxSP4NAx"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | + SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) + }, /* * A lot of modern Clevo barebones have touchpad and/or keyboard issues * after suspend fixable with the forcenorestore quirk. From a00f2015acdbd8a4b3d2382eaeebe11db1925fad Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Wed, 3 Sep 2025 12:21:33 -0700 Subject: [PATCH 105/450] drm/panthor: validate group queue count A panthor group can have at most MAX_CS_PER_CSG panthor queues. Fixes: 4bdca11507928 ("drm/panthor: Add the driver frontend block") Signed-off-by: Chia-I Wu Reviewed-by: Boris Brezillon # v1 Reviewed-by: Steven Price Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20250903192133.288477-1-olvaffe@gmail.com --- drivers/gpu/drm/panthor/panthor_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c index 1116f2d2826e..4d8e9b34702a 100644 --- a/drivers/gpu/drm/panthor/panthor_drv.c +++ b/drivers/gpu/drm/panthor/panthor_drv.c @@ -1094,7 +1094,7 @@ static int panthor_ioctl_group_create(struct drm_device *ddev, void *data, struct drm_panthor_queue_create *queue_args; int ret; - if (!args->queues.count) + if (!args->queues.count || args->queues.count > MAX_CS_PER_CSG) return -EINVAL; ret = PANTHOR_UOBJ_GET_ARRAY(queue_args, &args->queues); From d0f61658db58583b3acd1ada70a5352c39cd0388 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 1 Sep 2025 09:44:04 +0200 Subject: [PATCH 106/450] ASoC: codecs: lpass-rx-macro: Fix playback quality distortion Commit bb4a0f497bc1 ("ASoC: codecs: lpass: Drop unused AIF_INVALID first DAI identifier") removed first entry in enum with DAI identifiers, because it looked unused. Turns out that there is a relation between DAI ID and "RX_MACRO RX0 MUX"-like kcontrols which use "rx_macro_mux_text" array. That "rx_macro_mux_text" array used first three entries of DAI IDs enum, with value '0' being invalid. The value passed tp "RX_MACRO RX0 MUX"-like kcontrols was used as DAI ID and set to configure active channel count and mask, which are arrays indexed by DAI ID. After removal of first AIF_INVALID DAI identifier, this kcontrol was updating wrong entries in active channel count and mask arrays which was visible in reduced quality (distortions) during headset playback on the Qualcomm SM8750 MTP8750 board. It seems it also fixes recording silence (instead of actual sound) via headset, even though that's different macro codec. Reported-by: Alexey Klimov Fixes: bb4a0f497bc1 ("ASoC: codecs: lpass: Drop unused AIF_INVALID first DAI identifier") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Srinivas Kandagatla Tested-by: Srinivas Kandagatla Message-ID: <20250901074403.137263-2-krzysztof.kozlowski@linaro.org> Signed-off-by: Mark Brown --- sound/soc/codecs/lpass-rx-macro.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/sound/soc/codecs/lpass-rx-macro.c b/sound/soc/codecs/lpass-rx-macro.c index 238dbdb46c18..a8fc842cc94e 100644 --- a/sound/soc/codecs/lpass-rx-macro.c +++ b/sound/soc/codecs/lpass-rx-macro.c @@ -618,6 +618,7 @@ static struct interp_sample_rate sr_val_tbl[] = { {176400, 0xB}, {352800, 0xC}, }; +/* Matches also rx_macro_mux_text */ enum { RX_MACRO_AIF1_PB, RX_MACRO_AIF2_PB, @@ -722,6 +723,7 @@ static const char * const rx_int2_2_interp_mux_text[] = { "ZERO", "RX INT2_2 MUX", }; +/* Order must match RX_MACRO_MAX_DAIS enum (offset by 1) */ static const char *const rx_macro_mux_text[] = { "ZERO", "AIF1_PB", "AIF2_PB", "AIF3_PB", "AIF4_PB" }; @@ -2474,6 +2476,7 @@ static int rx_macro_mux_put(struct snd_kcontrol *kcontrol, struct soc_enum *e = (struct soc_enum *)kcontrol->private_value; struct snd_soc_dapm_update *update = NULL; u32 rx_port_value = ucontrol->value.enumerated.item[0]; + unsigned int dai_id; u32 aif_rst; struct rx_macro *rx = snd_soc_component_get_drvdata(component); @@ -2490,19 +2493,24 @@ static int rx_macro_mux_put(struct snd_kcontrol *kcontrol, switch (rx_port_value) { case 0: - if (rx->active_ch_cnt[aif_rst]) { - clear_bit(widget->shift, - &rx->active_ch_mask[aif_rst]); - rx->active_ch_cnt[aif_rst]--; + /* + * active_ch_cnt and active_ch_mask use DAI IDs (RX_MACRO_MAX_DAIS). + * active_ch_cnt == 0 was tested in if() above. + */ + dai_id = aif_rst - 1; + if (rx->active_ch_cnt[dai_id]) { + clear_bit(widget->shift, &rx->active_ch_mask[dai_id]); + rx->active_ch_cnt[dai_id]--; } break; case 1: case 2: case 3: case 4: - set_bit(widget->shift, - &rx->active_ch_mask[rx_port_value]); - rx->active_ch_cnt[rx_port_value]++; + /* active_ch_cnt and active_ch_mask use DAI IDs (WSA_MACRO_MAX_DAIS). */ + dai_id = rx_port_value - 1; + set_bit(widget->shift, &rx->active_ch_mask[dai_id]); + rx->active_ch_cnt[dai_id]++; break; default: dev_err(component->dev, From 9004a450fccbeb40a71cc173747da37a459fd4dc Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 31 Aug 2025 17:14:02 +0200 Subject: [PATCH 107/450] ASoC: codecs: lpass-wsa-macro: Fix speaker quality distortion Commit bb4a0f497bc1 ("ASoC: codecs: lpass: Drop unused AIF_INVALID first DAI identifier") removed first entry in enum with DAI identifiers, because it looked unused. Turns out that there is a relation between DAI ID and "WSA RX0 Mux"-like kcontrols (which use "rx_mux_text" array). That "rx_mux_text" array used first three entries of DAI IDs enum, with value '0' being invalid. The value passed tp "WSA RX0 Mux"-like kcontrols was used as DAI ID and set to configure active channel count and mask, which are arrays indexed by DAI ID. After removal of first AIF_INVALID DAI identifier, this kcontrol was updating wrong entries in active channel count and mask arrays which was visible in reduced quality (distortions) during speaker playback on several boards like Lenovo T14s laptop and Qualcomm SM8550-based boards. Reported-by: Alexey Klimov Fixes: bb4a0f497bc1 ("ASoC: codecs: lpass: Drop unused AIF_INVALID first DAI identifier") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Srinivas Kandagatla Tested-by: Srinivas Kandagatla Message-ID: <20250831151401.30897-2-krzysztof.kozlowski@linaro.org> Signed-off-by: Mark Brown --- sound/soc/codecs/lpass-wsa-macro.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/sound/soc/codecs/lpass-wsa-macro.c b/sound/soc/codecs/lpass-wsa-macro.c index da6adb3de21d..d7eec9fdaf9c 100644 --- a/sound/soc/codecs/lpass-wsa-macro.c +++ b/sound/soc/codecs/lpass-wsa-macro.c @@ -368,6 +368,7 @@ static struct interp_sample_rate int_mix_sample_rate_val[] = { {192000, 0x6}, /* 192K */ }; +/* Matches also rx_mux_text */ enum { WSA_MACRO_AIF1_PB, WSA_MACRO_AIF_MIX1_PB, @@ -465,6 +466,7 @@ static const char *const rx_mix_ec_text[] = { "ZERO", "RX_MIX_TX0", "RX_MIX_TX1" }; +/* Order must match WSA_MACRO_MAX_DAIS enum (offset by 1) */ static const char *const rx_mux_text[] = { "ZERO", "AIF1_PB", "AIF_MIX1_PB" }; @@ -2207,6 +2209,7 @@ static int wsa_macro_rx_mux_put(struct snd_kcontrol *kcontrol, u32 rx_port_value = ucontrol->value.integer.value[0]; u32 bit_input; u32 aif_rst; + unsigned int dai_id; struct wsa_macro *wsa = snd_soc_component_get_drvdata(component); aif_rst = wsa->rx_port_value[widget->shift]; @@ -2224,17 +2227,22 @@ static int wsa_macro_rx_mux_put(struct snd_kcontrol *kcontrol, switch (rx_port_value) { case 0: - if (wsa->active_ch_cnt[aif_rst]) { - clear_bit(bit_input, - &wsa->active_ch_mask[aif_rst]); - wsa->active_ch_cnt[aif_rst]--; + /* + * active_ch_cnt and active_ch_mask use DAI IDs (WSA_MACRO_MAX_DAIS). + * active_ch_cnt == 0 was tested in if() above. + */ + dai_id = aif_rst - 1; + if (wsa->active_ch_cnt[dai_id]) { + clear_bit(bit_input, &wsa->active_ch_mask[dai_id]); + wsa->active_ch_cnt[dai_id]--; } break; case 1: case 2: - set_bit(bit_input, - &wsa->active_ch_mask[rx_port_value]); - wsa->active_ch_cnt[rx_port_value]++; + /* active_ch_cnt and active_ch_mask use DAI IDs (WSA_MACRO_MAX_DAIS). */ + dai_id = rx_port_value - 1; + set_bit(bit_input, &wsa->active_ch_mask[dai_id]); + wsa->active_ch_cnt[dai_id]++; break; default: dev_err(component->dev, From 157cf360c4a8751f7f511a71cc3a283b5d27f889 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Thu, 4 Sep 2025 10:43:22 +0800 Subject: [PATCH 108/450] net: libwx: fix to enable RSS Now when SRIOV is enabled, PF with multiple queues can only receive all packets on queue 0. This is caused by an incorrect flag judgement, which prevents RSS from being enabled. In fact, RSS is supported for the functions when SRIOV is enabled. Remove the flag judgement to fix it. Fixes: c52d4b898901 ("net: libwx: Redesign flow when sriov is enabled") Cc: stable@vger.kernel.org Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Link: https://patch.msgid.link/A3B7449A08A044D0+20250904024322.87145-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index bcd07a715752..5cb353a97d6d 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -2078,10 +2078,6 @@ static void wx_setup_mrqc(struct wx *wx) { u32 rss_field = 0; - /* VT, and RSS do not coexist at the same time */ - if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) - return; - /* Disable indicating checksum in descriptor, enables RSS hash */ wr32m(wx, WX_PSR_CTL, WX_PSR_CTL_PCSD, WX_PSR_CTL_PCSD); From 349510052f765b6eb9c2a21d0ffe08ba61fa683c Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 1 Sep 2025 22:26:39 +0200 Subject: [PATCH 109/450] MAINTAINERS: Add drm-rust tree for Rust DRM drivers and infrastructure Multiple DRM Rust drivers (e.g. nova-core, nova-drm, Tyr, rvkms) are in development, with at least Nova and (soon) Tyr already upstream. Having a shared tree will ease and accelerate development, since all drivers can consume new infrastructure in the same release cycle. This includes infrastructure shared with other subsystem trees (e.g. Rust or driver-core). By consolidating in drm-rust, we avoid adding extra burden to drm-misc maintainers, e.g. dealing with cross-tree topic branches. The drm-misc tree is not a good fit for this stage of development, since its documented scope is small drivers with occasional large series. Rust drivers in development upstream, however, regularly involve large patch series, new infrastructure, and shared topic branches, which may not align well with drm-misc at this stage. The drm-rust tree may not be a permanent solution. Once the core Rust, DRM, and KMS infrastructure have stabilized, drivers and infrastructure changes are expected to transition into drm-misc or standalone driver trees respectively. Until then, drm-rust provides a dedicated place to coordinate development without disrupting existing workflows too much. Cc: David Airlie Cc: Simona Vetter Cc: Maarten Lankhorst Cc: Thomas Zimmermann Signed-off-by: Danilo Krummrich Acked-by: Janne Grunau Acked-by: Maxime Ripard Acked-by: Alexandre Courbot Acked-by: Jani Nikula Acked-by: Daniel Almeida Acked-by: Alice Ryhl Link: https://lore.kernel.org/r/20250901202850.208116-1-dakr@kernel.org Signed-off-by: Alice Ryhl --- MAINTAINERS | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6dcfbd11efef..2b155a12f776 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8079,7 +8079,6 @@ F: Documentation/devicetree/bindings/gpu/ F: Documentation/gpu/ F: drivers/gpu/drm/ F: drivers/gpu/vga/ -F: rust/kernel/drm/ F: include/drm/drm F: include/linux/vga* F: include/uapi/drm/ @@ -8096,6 +8095,16 @@ X: drivers/gpu/drm/radeon/ X: drivers/gpu/drm/tegra/ X: drivers/gpu/drm/xe/ +DRM DRIVERS AND COMMON INFRASTRUCTURE [RUST] +M: Danilo Krummrich +M: Alice Ryhl +S: Supported +W: https://drm.pages.freedesktop.org/maintainer-tools/drm-rust.html +T: git https://gitlab.freedesktop.org/drm/rust/kernel.git +F: drivers/gpu/drm/nova/ +F: drivers/gpu/nova-core/ +F: rust/kernel/drm/ + DRM DRIVERS FOR ALLWINNER A10 M: Maxime Ripard M: Chen-Yu Tsai From 923b70581cb6acede90f8aaf4afe5d1c58c67b71 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Fri, 22 Aug 2025 10:49:15 +0800 Subject: [PATCH 110/450] iommu/amd: Fix ivrs_base memleak in early_amd_iommu_init() Fix a permanent ACPI table memory leak in early_amd_iommu_init() when CMPXCHG16B feature is not supported Fixes: 82582f85ed22 ("iommu/amd: Disable AMD IOMMU if CMPXCHG16B feature is not supported") Cc: stable@vger.kernel.org Signed-off-by: Zhen Ni Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/20250822024915.673427-1-zhen.ni@easystack.cn Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 8de689b2c5ed..6795326a6524 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3067,7 +3067,8 @@ static int __init early_amd_iommu_init(void) if (!boot_cpu_has(X86_FEATURE_CX16)) { pr_err("Failed to initialize. The CMPXCHG16B feature is required.\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } /* From b3506e9bcc777ed6af2ab631c86a9990ed97b474 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Wed, 27 Aug 2025 17:08:27 -0400 Subject: [PATCH 111/450] iommu/s390: Fix memory corruption when using identity domain zpci_get_iommu_ctrs() returns counter information to be reported as part of device statistics; these counters are stored as part of the s390_domain. The problem, however, is that the identity domain is not backed by an s390_domain and so the conversion via to_s390_domain() yields a bad address that is zero'd initially and read on-demand later via a sysfs read. These counters aren't necessary for the identity domain; just return NULL in this case. This issue was discovered via KASAN with reports that look like: BUG: KASAN: global-out-of-bounds in zpci_fmb_enable_device when using the identity domain for a device on s390. Cc: stable@vger.kernel.org Fixes: 64af12c6ec3a ("iommu/s390: implement iommu passthrough via identity domain") Reported-by: Cam Miller Signed-off-by: Matthew Rosato Tested-by: Cam Miller Reviewed-by: Farhan Ali Reviewed-by: Niklas Schnelle Link: https://lore.kernel.org/r/20250827210828.274527-1-mjrosato@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 9c80d61deb2c..d7370347c910 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -1032,7 +1032,8 @@ struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev) lockdep_assert_held(&zdev->dom_lock); - if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED) + if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED || + zdev->s390_domain->type == IOMMU_DOMAIN_IDENTITY) return NULL; s390_domain = to_s390_domain(zdev->s390_domain); From dce043c07ca1ac19cfbe2844a6dc71e35c322353 Mon Sep 17 00:00:00 2001 From: Eugene Koira Date: Wed, 3 Sep 2025 13:53:29 +0800 Subject: [PATCH 112/450] iommu/vt-d: Fix __domain_mapping()'s usage of switch_to_super_page() switch_to_super_page() assumes the memory range it's working on is aligned to the target large page level. Unfortunately, __domain_mapping() doesn't take this into account when using it, and will pass unaligned ranges ultimately freeing a PTE range larger than expected. Take for example a mapping with the following iov_pfn range [0x3fe400, 0x4c0600), which should be backed by the following mappings: iov_pfn [0x3fe400, 0x3fffff] covered by 2MiB pages iov_pfn [0x400000, 0x4bffff] covered by 1GiB pages iov_pfn [0x4c0000, 0x4c05ff] covered by 2MiB pages Under this circumstance, __domain_mapping() will pass [0x400000, 0x4c05ff] to switch_to_super_page() at a 1 GiB granularity, which will in turn free PTEs all the way to iov_pfn 0x4fffff. Mitigate this by rounding down the iov_pfn range passed to switch_to_super_page() in __domain_mapping() to the target large page level. Additionally add range alignment checks to switch_to_super_page. Fixes: 9906b9352a35 ("iommu/vt-d: Avoid duplicate removing in __domain_mapping()") Signed-off-by: Eugene Koira Cc: stable@vger.kernel.org Reviewed-by: Nicolas Saenz Julienne Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9c3ab9d9f69a..dff2d895b8ab 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1575,6 +1575,10 @@ static void switch_to_super_page(struct dmar_domain *domain, unsigned long lvl_pages = lvl_to_nr_pages(level); struct dma_pte *pte = NULL; + if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) || + !IS_ALIGNED(end_pfn + 1, lvl_pages))) + return; + while (start_pfn <= end_pfn) { if (!pte) pte = pfn_to_dma_pte(domain, start_pfn, &level, @@ -1650,7 +1654,8 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long pages_to_remove; pteval |= DMA_PTE_LARGE_PAGE; - pages_to_remove = min_t(unsigned long, nr_pages, + pages_to_remove = min_t(unsigned long, + round_down(nr_pages, lvl_pages), nr_pte_to_next_page(pte) * lvl_pages); end_pfn = iov_pfn + pages_to_remove - 1; switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl); From 9ffaf5229055fcfbb3b3d6f1c7e58d63715c3f73 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 4 Sep 2025 10:59:49 +0200 Subject: [PATCH 113/450] iommu/s390: Make attach succeed when the device was surprise removed When a PCI device is removed with surprise hotplug, there may still be attempts to attach the device to the default domain as part of tear down via (__iommu_release_dma_ownership()), or because the removal happens during probe (__iommu_probe_device()). In both cases zpci_register_ioat() fails with a cc value indicating that the device handle is invalid. This is because the device is no longer part of the instance as far as the hypervisor is concerned. Currently this leads to an error return and s390_iommu_attach_device() fails. This triggers the WARN_ON() in __iommu_group_set_domain_nofail() because attaching to the default domain must never fail. With the device fenced by the hypervisor no DMAs to or from memory are possible and the IOMMU translations have no effect. Proceed as if the registration was successful and let the hotplug event handling clean up the device. This is similar to how devices in the error state are handled since commit 59bbf596791b ("iommu/s390: Make attach succeed even if the device is in error state") except that for removal the domain will not be registered later. This approach was also previously discussed at the link. Handle both cases, error state and removal, in a helper which checks if the error needs to be propagated or ignored. Avoid magic number condition codes by using the pre-existing, but never used, defines for PCI load/store condition codes and rename them to reflect that they apply to all PCI instructions. Cc: stable@vger.kernel.org # v6.2 Link: https://lore.kernel.org/linux-iommu/20240808194155.GD1985367@ziepe.ca/ Suggested-by: Jason Gunthorpe Signed-off-by: Niklas Schnelle Reviewed-by: Matthew Rosato Reviewed-by: Benjamin Block Link: https://lore.kernel.org/r/20250904-iommu_succeed_attach_removed-v1-1-e7f333d2f80f@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/include/asm/pci_insn.h | 10 +++++----- drivers/iommu/s390-iommu.c | 26 +++++++++++++++++++------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index e5f57cfe1d45..025c6dcbf893 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -16,11 +16,11 @@ #define ZPCI_PCI_ST_FUNC_NOT_AVAIL 40 #define ZPCI_PCI_ST_ALREADY_IN_RQ_STATE 44 -/* Load/Store return codes */ -#define ZPCI_PCI_LS_OK 0 -#define ZPCI_PCI_LS_ERR 1 -#define ZPCI_PCI_LS_BUSY 2 -#define ZPCI_PCI_LS_INVAL_HANDLE 3 +/* PCI instruction condition codes */ +#define ZPCI_CC_OK 0 +#define ZPCI_CC_ERR 1 +#define ZPCI_CC_BUSY 2 +#define ZPCI_CC_INVAL_HANDLE 3 /* Load/Store address space identifiers */ #define ZPCI_PCIAS_MEMIO_0 0 diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index d7370347c910..aa576736d60b 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -612,6 +612,23 @@ static u64 get_iota_region_flag(struct s390_domain *domain) } } +static bool reg_ioat_propagate_error(int cc, u8 status) +{ + /* + * If the device is in the error state the reset routine + * will register the IOAT of the newly set domain on re-enable + */ + if (cc == ZPCI_CC_ERR && status == ZPCI_PCI_ST_FUNC_NOT_AVAIL) + return false; + /* + * If the device was removed treat registration as success + * and let the subsequent error event trigger tear down. + */ + if (cc == ZPCI_CC_INVAL_HANDLE) + return false; + return cc != ZPCI_CC_OK; +} + static int s390_iommu_domain_reg_ioat(struct zpci_dev *zdev, struct iommu_domain *domain, u8 *status) { @@ -696,7 +713,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); - if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) + if (reg_ioat_propagate_error(cc, status)) return -EIO; zdev->dma_table = s390_domain->dma_table; zdev_s390_domain_update(zdev, domain); @@ -1124,12 +1141,7 @@ static int s390_attach_dev_identity(struct iommu_domain *domain, /* If we fail now DMA remains blocked via blocking domain */ cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); - - /* - * If the device is undergoing error recovery the reset code - * will re-establish the new domain. - */ - if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) + if (reg_ioat_propagate_error(cc, status)) return -EIO; zdev_s390_domain_update(zdev, domain); From e1bf212d0604d2cbb5514e47ccec252b656071fb Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Thu, 4 Sep 2025 20:01:19 +0800 Subject: [PATCH 114/450] fuse: virtio_fs: fix page fault for DAX page address The commit ced17ee32a99 ("Revert "virtio: reject shm region if length is zero"") exposes the following DAX page fault bug (this fix the failure that getting shm region alway returns false because of zero length): The commit 21aa65bf82a7 ("mm: remove callers of pfn_t functionality") handles the DAX physical page address incorrectly: the removed macro 'phys_to_pfn_t()' should be replaced with 'PHYS_PFN()'. [ 1.390321] BUG: unable to handle page fault for address: ffffd3fb40000008 [ 1.390875] #PF: supervisor read access in kernel mode [ 1.391257] #PF: error_code(0x0000) - not-present page [ 1.391509] PGD 0 P4D 0 [ 1.391626] Oops: Oops: 0000 [#1] SMP NOPTI [ 1.391806] CPU: 6 UID: 1000 PID: 162 Comm: weston Not tainted 6.17.0-rc3-WSL2-STABLE #2 PREEMPT(none) [ 1.392361] RIP: 0010:dax_to_folio+0x14/0x60 [ 1.392653] Code: 52 c9 c3 00 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 c1 ef 05 48 c1 e7 06 48 03 3d 34 b5 31 01 <48> 8b 57 08 48 89 f8 f6 c2 01 75 2b 66 90 c3 cc cc cc cc f7 c7 ff [ 1.393727] RSP: 0000:ffffaf7d04407aa8 EFLAGS: 00010086 [ 1.394003] RAX: 000000a000000000 RBX: ffffaf7d04407bb0 RCX: 0000000000000000 [ 1.394524] RDX: ffffd17b40000008 RSI: 0000000000000083 RDI: ffffd3fb40000000 [ 1.394967] RBP: 0000000000000011 R08: 000000a000000000 R09: 0000000000000000 [ 1.395400] R10: 0000000000001000 R11: ffffaf7d04407c10 R12: 0000000000000000 [ 1.395806] R13: ffffa020557be9c0 R14: 0000014000000001 R15: 0000725970e94000 [ 1.396268] FS: 000072596d6d2ec0(0000) GS:ffffa0222dc59000(0000) knlGS:0000000000000000 [ 1.396715] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.397100] CR2: ffffd3fb40000008 CR3: 000000011579c005 CR4: 0000000000372ef0 [ 1.397518] Call Trace: [ 1.397663] [ 1.397900] dax_insert_entry+0x13b/0x390 [ 1.398179] dax_fault_iter+0x2a5/0x6c0 [ 1.398443] dax_iomap_pte_fault+0x193/0x3c0 [ 1.398750] __fuse_dax_fault+0x8b/0x270 [ 1.398997] ? vm_mmap_pgoff+0x161/0x210 [ 1.399175] __do_fault+0x30/0x180 [ 1.399360] do_fault+0xc4/0x550 [ 1.399547] __handle_mm_fault+0x8e3/0xf50 [ 1.399731] ? do_syscall_64+0x72/0x1e0 [ 1.399958] handle_mm_fault+0x192/0x2f0 [ 1.400204] do_user_addr_fault+0x20e/0x700 [ 1.400418] exc_page_fault+0x66/0x150 [ 1.400602] asm_exc_page_fault+0x26/0x30 [ 1.400831] RIP: 0033:0x72596d1bf703 [ 1.401076] Code: 31 f6 45 31 e4 48 8d 15 b3 73 00 00 e8 06 03 00 00 8b 83 68 01 00 00 e9 8e fa ff ff 0f 1f 00 48 8b 44 24 08 4c 89 ee 48 89 df 00 21 43 34 12 e8 72 09 00 00 e9 6a fa ff ff 0f 1f 44 00 00 e8 [ 1.402172] RSP: 002b:00007ffc350f6dc0 EFLAGS: 00010202 [ 1.402488] RAX: 0000725970e94000 RBX: 00005b7c642c2560 RCX: 0000725970d359a7 [ 1.402898] RDX: 0000000000000003 RSI: 00007ffc350f6dc0 RDI: 00005b7c642c2560 [ 1.403284] RBP: 00007ffc350f6e90 R08: 000000000000000d R09: 0000000000000000 [ 1.403634] R10: 00007ffc350f6dd8 R11: 0000000000000246 R12: 0000000000000001 [ 1.404078] R13: 00007ffc350f6dc0 R14: 0000725970e29ce0 R15: 0000000000000003 [ 1.404450] [ 1.404570] Modules linked in: [ 1.404821] CR2: ffffd3fb40000008 [ 1.405029] ---[ end trace 0000000000000000 ]--- [ 1.405323] RIP: 0010:dax_to_folio+0x14/0x60 [ 1.405556] Code: 52 c9 c3 00 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 c1 ef 05 48 c1 e7 06 48 03 3d 34 b5 31 01 <48> 8b 57 08 48 89 f8 f6 c2 01 75 2b 66 90 c3 cc cc cc cc f7 c7 ff [ 1.406639] RSP: 0000:ffffaf7d04407aa8 EFLAGS: 00010086 [ 1.406910] RAX: 000000a000000000 RBX: ffffaf7d04407bb0 RCX: 0000000000000000 [ 1.407379] RDX: ffffd17b40000008 RSI: 0000000000000083 RDI: ffffd3fb40000000 [ 1.407800] RBP: 0000000000000011 R08: 000000a000000000 R09: 0000000000000000 [ 1.408246] R10: 0000000000001000 R11: ffffaf7d04407c10 R12: 0000000000000000 [ 1.408666] R13: ffffa020557be9c0 R14: 0000014000000001 R15: 0000725970e94000 [ 1.409170] FS: 000072596d6d2ec0(0000) GS:ffffa0222dc59000(0000) knlGS:0000000000000000 [ 1.409608] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.409977] CR2: ffffd3fb40000008 CR3: 000000011579c005 CR4: 0000000000372ef0 [ 1.410437] Kernel panic - not syncing: Fatal exception [ 1.410857] Kernel Offset: 0xc000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) Fixes: 21aa65bf82a7 ("mm: remove callers of pfn_t functionality") Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/20250904120339.972-1-haiyuewa@163.com Acked-by: David Hildenbrand Reviewed-by: Miklos Szeredi Signed-off-by: Christian Brauner --- fs/fuse/virtio_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index c826e7ca49f5..76c8fd0bfc75 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1016,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) - *pfn = fs->window_phys_addr + offset; + *pfn = PHYS_PFN(fs->window_phys_addr + offset); return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; } From 4550d33e18112a11a740424c4eec063cd58e918c Mon Sep 17 00:00:00 2001 From: Santhosh Kumar K Date: Thu, 4 Sep 2025 18:47:41 +0530 Subject: [PATCH 115/450] mtd: spinand: winbond: Fix oob_layout for W25N01JW Fix the W25N01JW's oob_layout according to the datasheet [1] [1] https://www.winbond.com/hq/product/code-storage-flash-memory/qspinand-flash/?__locale=en&partNo=W25N01JW Fixes: 6a804fb72de5 ("mtd: spinand: winbond: add support for serial NAND flash") Cc: Sridharan S N Cc: stable@vger.kernel.org Signed-off-by: Santhosh Kumar K Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/winbond.c | 37 +++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c index 87053389a1fc..4870b2d5edb2 100644 --- a/drivers/mtd/nand/spi/winbond.c +++ b/drivers/mtd/nand/spi/winbond.c @@ -176,6 +176,36 @@ static const struct mtd_ooblayout_ops w25n02kv_ooblayout = { .free = w25n02kv_ooblayout_free, }; +static int w25n01jw_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *region) +{ + if (section > 3) + return -ERANGE; + + region->offset = (16 * section) + 12; + region->length = 4; + + return 0; +} + +static int w25n01jw_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *region) +{ + if (section > 3) + return -ERANGE; + + region->offset = (16 * section); + region->length = 12; + + /* Extract BBM */ + if (!section) { + region->offset += 2; + region->length -= 2; + } + + return 0; +} + static int w35n01jw_ooblayout_ecc(struct mtd_info *mtd, int section, struct mtd_oob_region *region) { @@ -206,6 +236,11 @@ static int w35n01jw_ooblayout_free(struct mtd_info *mtd, int section, return 0; } +static const struct mtd_ooblayout_ops w25n01jw_ooblayout = { + .ecc = w25n01jw_ooblayout_ecc, + .free = w25n01jw_ooblayout_free, +}; + static const struct mtd_ooblayout_ops w35n01jw_ooblayout = { .ecc = w35n01jw_ooblayout_ecc, .free = w35n01jw_ooblayout_free, @@ -394,7 +429,7 @@ static const struct spinand_info winbond_spinand_table[] = { &write_cache_variants, &update_cache_variants), 0, - SPINAND_ECCINFO(&w25m02gv_ooblayout, NULL), + SPINAND_ECCINFO(&w25n01jw_ooblayout, NULL), SPINAND_CONFIGURE_CHIP(w25n0xjw_hs_cfg)), SPINAND_INFO("W25N01KV", /* 3.3V */ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xae, 0x21), From 5a91f52c8650334aaf8c4c7c90f40c6906994225 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 12 Aug 2025 10:29:23 -0400 Subject: [PATCH 116/450] MAINTAINERS: update btrfs entry This is an update to reflect reality, not a signal of any seismic change. Dave Sterba has been the acting maintainer for almost a decade, I've simply been here as a backstop in case he gets hit by a bus. The fact is we have a strong and thriving community with any number of more active developers that can take on that role if it's necessary. I'm exceedingly happy and proud of the work that Dave has done in keeping us all in line, and know that if further changes need to be made it'll be with the development community we've built throughout the lifetime of btrfs. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 10850512c118..bc4f57d179d7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5157,7 +5157,6 @@ F: drivers/gpio/gpio-bt8xx.c BTRFS FILE SYSTEM M: Chris Mason -M: Josef Bacik M: David Sterba L: linux-btrfs@vger.kernel.org S: Maintained From 3d1267475b94b3df7a61e4ea6788c7c5d9e473c4 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 2 Sep 2025 11:34:10 +0100 Subject: [PATCH 117/450] btrfs: don't allow adding block device of less than 1 MB Commit 15ae0410c37a79 ("btrfs-progs: add error handling for device_get_partition_size_fd_stat()") in btrfs-progs inadvertently changed it so that if the BLKGETSIZE64 ioctl on a block device returned a size of 0, this was no longer seen as an error condition. Unfortunately this is how disconnected NBD devices behave, meaning that with btrfs-progs 6.16 it's now possible to add a device you can't remove: # btrfs device add /dev/nbd0 /root/temp # btrfs device remove /dev/nbd0 /root/temp ERROR: error removing device '/dev/nbd0': Invalid argument This check should always have been done kernel-side anyway, so add a check in btrfs_init_new_device() that the new device doesn't have a size less than BTRFS_DEVICE_RANGE_RESERVED (i.e. 1 MB). Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fa7a929a0461..c6e3efd6f602 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2722,6 +2722,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error; } + if (bdev_nr_bytes(file_bdev(bdev_file)) <= BTRFS_DEVICE_RANGE_RESERVED) { + ret = -EINVAL; + goto error; + } + if (fs_devices->seeding) { seeding_dev = true; down_write(&sb->s_umount); From 992203a1fba51b025c60ec0c8b0d9223343dea95 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Thu, 7 Aug 2025 12:49:38 -0400 Subject: [PATCH 118/450] nfs/localio: restore creds before releasing pageio data Otherwise if the nfsd filecache code releases the nfsd_file immediately, it can trigger the BUG_ON(cred == current->cred) in __put_cred() when it puts the nfsd_file->nf_file->f-cred. Fixes: b9f5dd57f4a5 ("nfs/localio: use dedicated workqueues for filesystem read and write") Signed-off-by: Scott Mayhew Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20250807164938.2395136-1-smayhew@redhat.com Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index bd5fca285899..bdb82a19136a 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -453,12 +453,13 @@ static void nfs_local_call_read(struct work_struct *work) nfs_local_iter_init(&iter, iocb, READ); status = filp->f_op->read_iter(&iocb->kiocb, &iter); + + revert_creds(save_cred); + if (status != -EIOCBQUEUED) { nfs_local_read_done(iocb, status); nfs_local_pgio_release(iocb); } - - revert_creds(save_cred); } static int @@ -648,14 +649,15 @@ static void nfs_local_call_write(struct work_struct *work) file_start_write(filp); status = filp->f_op->write_iter(&iocb->kiocb, &iter); file_end_write(filp); + + revert_creds(save_cred); + current->flags = old_flags; + if (status != -EIOCBQUEUED) { nfs_local_write_done(iocb, status); nfs_local_vfs_getattr(iocb); nfs_local_pgio_release(iocb); } - - revert_creds(save_cred); - current->flags = old_flags; } static int From 80d03a40837a9b26750a25122b906c052cc846c9 Mon Sep 17 00:00:00 2001 From: Vladimir Riabchun Date: Tue, 26 Aug 2025 18:16:46 +0200 Subject: [PATCH 119/450] ftrace/samples: Fix function size computation In my_tramp1 function .size directive was placed above ASM_RET instruction, leading to a wrong function size. Link: https://lore.kernel.org/aK3d7vxNcO52kEmg@vova-pc Fixes: 9d907f1ae80b ("samples/ftrace: Fix asm function ELF annotations") Signed-off-by: Vladimir Riabchun Signed-off-by: Steven Rostedt (Google) --- samples/ftrace/ftrace-direct-modify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index cfea7a38befb..da3a9f2091f5 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -75,8 +75,8 @@ asm ( CALL_DEPTH_ACCOUNT " call my_direct_func1\n" " leave\n" -" .size my_tramp1, .-my_tramp1\n" ASM_RET +" .size my_tramp1, .-my_tramp1\n" " .type my_tramp2, @function\n" " .globl my_tramp2\n" From 2c334d038466ac509468fbe06905a32d202117db Mon Sep 17 00:00:00 2001 From: "H. Nikolaus Schaller" Date: Sat, 23 Aug 2025 12:34:56 +0200 Subject: [PATCH 120/450] power: supply: bq27xxx: fix error return in case of no bq27000 hdq battery Since commit commit f16d9fb6cf03 ("power: supply: bq27xxx: Retrieve again when busy") the console log of some devices with hdq enabled but no bq27000 battery (like e.g. the Pandaboard) is flooded with messages like: [ 34.247833] power_supply bq27000-battery: driver failed to report 'status' property: -1 as soon as user-space is finding a /sys entry and trying to read the "status" property. It turns out that the offending commit changes the logic to now return the value of cache.flags if it is <0. This is likely under the assumption that it is an error number. In normal errors from bq27xxx_read() this is indeed the case. But there is special code to detect if no bq27000 is installed or accessible through hdq/1wire and wants to report this. In that case, the cache.flags are set historically by commit 3dd843e1c26a ("bq27000: report missing device better.") to constant -1 which did make reading properties return -ENODEV. So everything appeared to be fine before the return value was passed upwards. Now the -1 is returned as -EPERM instead of -ENODEV, triggering the error condition in power_supply_format_property() which then floods the console log. So we change the detection of missing bq27000 battery to simply set cache.flags = -ENODEV instead of -1. Fixes: f16d9fb6cf03 ("power: supply: bq27xxx: Retrieve again when busy") Cc: Jerry Lv Cc: stable@vger.kernel.org Signed-off-by: H. Nikolaus Schaller Link: https://lore.kernel.org/r/692f79eb6fd541adb397038ea6e750d4de2deddf.1755945297.git.hns@goldelico.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq27xxx_battery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index 93dcebbe1141..efe02ad695a6 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -1920,7 +1920,7 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di) cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag); if ((cache.flags & 0xff) == 0xff) - cache.flags = -1; /* read error */ + cache.flags = -ENODEV; /* read error */ if (cache.flags >= 0) { cache.capacity = bq27xxx_battery_read_soc(di); From 1e451977e1703b6db072719b37cd1b8e250b9cc9 Mon Sep 17 00:00:00 2001 From: "H. Nikolaus Schaller" Date: Sat, 23 Aug 2025 12:34:57 +0200 Subject: [PATCH 121/450] power: supply: bq27xxx: restrict no-battery detection to bq27000 There are fuel gauges in the bq27xxx series (e.g. bq27z561) which may in some cases report 0xff as the value of BQ27XXX_REG_FLAGS that should not be interpreted as "no battery" like for a disconnected battery with some built in bq27000 chip. So restrict the no-battery detection originally introduced by commit 3dd843e1c26a ("bq27000: report missing device better.") to the bq27000. There is no need to backport further because this was hidden before commit f16d9fb6cf03 ("power: supply: bq27xxx: Retrieve again when busy") Fixes: f16d9fb6cf03 ("power: supply: bq27xxx: Retrieve again when busy") Suggested-by: Jerry Lv Cc: stable@vger.kernel.org Signed-off-by: H. Nikolaus Schaller Link: https://lore.kernel.org/r/dd979fa6855fd051ee5117016c58daaa05966e24.1755945297.git.hns@goldelico.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq27xxx_battery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index efe02ad695a6..ad2d9ecf32a5 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -1919,8 +1919,8 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di) bool has_singe_flag = di->opts & BQ27XXX_O_ZERO; cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag); - if ((cache.flags & 0xff) == 0xff) - cache.flags = -ENODEV; /* read error */ + if (di->chip == BQ27000 && (cache.flags & 0xff) == 0xff) + cache.flags = -ENODEV; /* bq27000 hdq read error */ if (cache.flags >= 0) { cache.capacity = bq27xxx_battery_read_soc(di); From 03e79de4608bdd48ad6eec272e196124cefaf798 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Thu, 4 Sep 2025 11:13:34 +0200 Subject: [PATCH 122/450] net: fec: Fix possible NPD in fec_enet_phy_reset_after_clk_enable() The function of_phy_find_device may return NULL, so we need to take care before dereferencing phy_dev. Fixes: 64a632da538a ("net: fec: Fix phy_device lookup for phy_reset_after_clk_enable()") Signed-off-by: Stefan Wahren Cc: Christoph Niedermaier Cc: Richard Leitner Reviewed-by: Simon Horman Reviewed-by: Wei Fang Link: https://patch.msgid.link/20250904091334.53965-1-wahrenst@gmx.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 1383918f8a3f..adf1f2bbcbb1 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2363,7 +2363,8 @@ static void fec_enet_phy_reset_after_clk_enable(struct net_device *ndev) */ phy_dev = of_phy_find_device(fep->phy_node); phy_reset_after_clk_enable(phy_dev); - put_device(&phy_dev->mdio.dev); + if (phy_dev) + put_device(&phy_dev->mdio.dev); } } From 0ba5b2f2c381dbec9ed9e4ab3ae5d3e667de0dc3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 4 Sep 2025 15:52:37 +0300 Subject: [PATCH 123/450] net: phylink: add lock for serializing concurrent pl->phydev writes with resolver Currently phylink_resolve() protects itself against concurrent phylink_bringup_phy() or phylink_disconnect_phy() calls which modify pl->phydev by relying on pl->state_mutex. The problem is that in phylink_resolve(), pl->state_mutex is in a lock inversion state with pl->phydev->lock. So pl->phydev->lock needs to be acquired prior to pl->state_mutex. But that requires dereferencing pl->phydev in the first place, and without pl->state_mutex, that is racy. Hence the reason for the extra lock. Currently it is redundant, but it will serve a functional purpose once mutex_lock(&phy->lock) will be moved outside of the mutex_lock(&pl->state_mutex) section. Another alternative considered would have been to let phylink_resolve() acquire the rtnl_mutex, which is also held when phylink_bringup_phy() and phylink_disconnect_phy() are called. But since phylink_disconnect_phy() runs under rtnl_lock(), it would deadlock with phylink_resolve() when calling flush_work(&pl->resolve). Additionally, it would have been undesirable because it would have unnecessarily blocked many other call paths as well in the entire kernel, so the smaller-scoped lock was preferred. Link: https://lore.kernel.org/netdev/aLb6puGVzR29GpPx@shell.armlinux.org.uk/ Signed-off-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250904125238.193990-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index c7cb95aa8007..aa17ad2622fc 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -67,6 +67,8 @@ struct phylink { struct timer_list link_poll; struct mutex state_mutex; + /* Serialize updates to pl->phydev with phylink_resolve() */ + struct mutex phydev_mutex; struct phylink_link_state phy_state; unsigned int phy_ib_mode; struct work_struct resolve; @@ -1591,8 +1593,11 @@ static void phylink_resolve(struct work_struct *w) struct phylink_link_state link_state; bool mac_config = false; bool retrigger = false; + struct phy_device *phy; bool cur_link_state; + mutex_lock(&pl->phydev_mutex); + phy = pl->phydev; mutex_lock(&pl->state_mutex); cur_link_state = phylink_link_is_up(pl); @@ -1626,11 +1631,11 @@ static void phylink_resolve(struct work_struct *w) /* If we have a phy, the "up" state is the union of both the * PHY and the MAC */ - if (pl->phydev) + if (phy) link_state.link &= pl->phy_state.link; /* Only update if the PHY link is up */ - if (pl->phydev && pl->phy_state.link) { + if (phy && pl->phy_state.link) { /* If the interface has changed, force a link down * event if the link isn't already down, and re-resolve. */ @@ -1694,6 +1699,7 @@ static void phylink_resolve(struct work_struct *w) queue_work(system_power_efficient_wq, &pl->resolve); } mutex_unlock(&pl->state_mutex); + mutex_unlock(&pl->phydev_mutex); } static void phylink_run_resolve(struct phylink *pl) @@ -1829,6 +1835,7 @@ struct phylink *phylink_create(struct phylink_config *config, if (!pl) return ERR_PTR(-ENOMEM); + mutex_init(&pl->phydev_mutex); mutex_init(&pl->state_mutex); INIT_WORK(&pl->resolve, phylink_resolve); @@ -2089,6 +2096,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, dev_name(&phy->mdio.dev), phy->drv->name, irq_str); kfree(irq_str); + mutex_lock(&pl->phydev_mutex); mutex_lock(&phy->lock); mutex_lock(&pl->state_mutex); pl->phydev = phy; @@ -2134,6 +2142,7 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, mutex_unlock(&pl->state_mutex); mutex_unlock(&phy->lock); + mutex_unlock(&pl->phydev_mutex); phylink_dbg(pl, "phy: %s setting supported %*pb advertising %*pb\n", @@ -2312,6 +2321,7 @@ void phylink_disconnect_phy(struct phylink *pl) ASSERT_RTNL(); + mutex_lock(&pl->phydev_mutex); phy = pl->phydev; if (phy) { mutex_lock(&phy->lock); @@ -2321,8 +2331,11 @@ void phylink_disconnect_phy(struct phylink *pl) pl->mac_tx_clk_stop = false; mutex_unlock(&pl->state_mutex); mutex_unlock(&phy->lock); - flush_work(&pl->resolve); + } + mutex_unlock(&pl->phydev_mutex); + if (phy) { + flush_work(&pl->resolve); phy_disconnect(phy); } } From e2a10daba84968f6b5777d150985fd7d6abc9c84 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 4 Sep 2025 15:52:38 +0300 Subject: [PATCH 124/450] net: phy: transfer phy_config_inband() locking responsibility to phylink Problem description =================== Lockdep reports a possible circular locking dependency (AB/BA) between &pl->state_mutex and &phy->lock, as follows. phylink_resolve() // acquires &pl->state_mutex -> phylink_major_config() -> phy_config_inband() // acquires &pl->phydev->lock whereas all the other call sites where &pl->state_mutex and &pl->phydev->lock have the locking scheme reversed. Everywhere else, &pl->phydev->lock is acquired at the top level, and &pl->state_mutex at the lower level. A clear example is phylink_bringup_phy(). The outlier is the newly introduced phy_config_inband() and the existing lock order is the correct one. To understand why it cannot be the other way around, it is sufficient to consider phylink_phy_change(), phylink's callback from the PHY device's phy->phy_link_change() virtual method, invoked by the PHY state machine. phy_link_up() and phy_link_down(), the (indirect) callers of phylink_phy_change(), are called with &phydev->lock acquired. Then phylink_phy_change() acquires its own &pl->state_mutex, to serialize changes made to its pl->phy_state and pl->link_config. So all other instances of &pl->state_mutex and &phydev->lock must be consistent with this order. Problem impact ============== I think the kernel runs a serious deadlock risk if an existing phylink_resolve() thread, which results in a phy_config_inband() call, is concurrent with a phy_link_up() or phy_link_down() call, which will deadlock on &pl->state_mutex in phylink_phy_change(). Practically speaking, the impact may be limited by the slow speed of the medium auto-negotiation protocol, which makes it unlikely for the current state to still be unresolved when a new one is detected, but I think the problem is there. Nonetheless, the problem was discovered using lockdep. Proposed solution ================= Practically speaking, the phy_config_inband() requirement of having phydev->lock acquired must transfer to the caller (phylink is the only caller). There, it must bubble up until immediately before &pl->state_mutex is acquired, for the cases where that takes place. Solution details, considerations, notes ======================================= This is the phy_config_inband() call graph: sfp_upstream_ops :: connect_phy() | v phylink_sfp_connect_phy() | v phylink_sfp_config_phy() | | sfp_upstream_ops :: module_insert() | | | v | phylink_sfp_module_insert() | | | | sfp_upstream_ops :: module_start() | | | | | v | | phylink_sfp_module_start() | | | | v v | phylink_sfp_config_optical() phylink_start() | | | phylink_resume() v v | | phylink_sfp_set_config() | | | v v v phylink_mac_initial_config() | phylink_resolve() | | phylink_ethtool_ksettings_set() v v v phylink_major_config() | v phy_config_inband() phylink_major_config() caller #1, phylink_mac_initial_config(), does not acquire &pl->state_mutex nor do its callers. It must acquire &pl->phydev->lock prior to calling phylink_major_config(). phylink_major_config() caller #2, phylink_resolve() acquires &pl->state_mutex, thus also needs to acquire &pl->phydev->lock. phylink_major_config() caller #3, phylink_ethtool_ksettings_set(), is completely uninteresting, because it only calls phylink_major_config() if pl->phydev is NULL (otherwise it calls phy_ethtool_ksettings_set()). We need to change nothing there. Other solutions =============== The lock inversion between &pl->state_mutex and &pl->phydev->lock has occurred at least once before, as seen in commit c718af2d00a3 ("net: phylink: fix ethtool -A with attached PHYs"). The solution there was to simply not call phy_set_asym_pause() under the &pl->state_mutex. That cannot be extended to our case though, where the phy_config_inband() call is much deeper inside the &pl->state_mutex section. Fixes: 5fd0f1a02e75 ("net: phylink: add negotiation of in-band capabilities") Signed-off-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250904125238.193990-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 12 ++++-------- drivers/net/phy/phylink.c | 9 +++++++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 13df28445f02..c02da57a4da5 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1065,23 +1065,19 @@ EXPORT_SYMBOL_GPL(phy_inband_caps); */ int phy_config_inband(struct phy_device *phydev, unsigned int modes) { - int err; + lockdep_assert_held(&phydev->lock); if (!!(modes & LINK_INBAND_DISABLE) + !!(modes & LINK_INBAND_ENABLE) + !!(modes & LINK_INBAND_BYPASS) != 1) return -EINVAL; - mutex_lock(&phydev->lock); if (!phydev->drv) - err = -EIO; + return -EIO; else if (!phydev->drv->config_inband) - err = -EOPNOTSUPP; - else - err = phydev->drv->config_inband(phydev, modes); - mutex_unlock(&phydev->lock); + return -EOPNOTSUPP; - return err; + return phydev->drv->config_inband(phydev, modes); } EXPORT_SYMBOL(phy_config_inband); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index aa17ad2622fc..1988b7d2089a 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1434,6 +1434,7 @@ static void phylink_get_fixed_state(struct phylink *pl, static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) { struct phylink_link_state link_state; + struct phy_device *phy = pl->phydev; switch (pl->req_link_an_mode) { case MLO_AN_PHY: @@ -1457,7 +1458,11 @@ static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) link_state.link = false; phylink_apply_manual_flow(pl, &link_state); + if (phy) + mutex_lock(&phy->lock); phylink_major_config(pl, force_restart, &link_state); + if (phy) + mutex_unlock(&phy->lock); } static const char *phylink_pause_to_str(int pause) @@ -1598,6 +1603,8 @@ static void phylink_resolve(struct work_struct *w) mutex_lock(&pl->phydev_mutex); phy = pl->phydev; + if (phy) + mutex_lock(&phy->lock); mutex_lock(&pl->state_mutex); cur_link_state = phylink_link_is_up(pl); @@ -1699,6 +1706,8 @@ static void phylink_resolve(struct work_struct *w) queue_work(system_power_efficient_wq, &pl->resolve); } mutex_unlock(&pl->state_mutex); + if (phy) + mutex_unlock(&phy->lock); mutex_unlock(&pl->phydev_mutex); } From 220a0ffde02f962c13bc752b01aa570b8c65a37b Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 2 Sep 2025 13:53:04 +0300 Subject: [PATCH 125/450] xhci: dbc: decouple endpoint allocation from initialization Decouple allocation of endpoint ring buffer from initialization of the buffer, and initialization of endpoint context parts from from the rest of the contexts. It allows driver to clear up and reinitialize endpoint rings after disconnect without reallocating everything. This is a prerequisite for the next patch that prevents the transfer ring from filling up with cancelled (no-op) TRBs if a debug cable is reconnected several times without transferring anything. Cc: stable@vger.kernel.org Fixes: dfba2174dc42 ("usb: xhci: Add DbC support in xHCI driver") Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250902105306.877476-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-dbgcap.c | 71 ++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/drivers/usb/host/xhci-dbgcap.c b/drivers/usb/host/xhci-dbgcap.c index 06a2edb9e86e..d0faff233e3e 100644 --- a/drivers/usb/host/xhci-dbgcap.c +++ b/drivers/usb/host/xhci-dbgcap.c @@ -101,13 +101,34 @@ static u32 xhci_dbc_populate_strings(struct dbc_str_descs *strings) return string_length; } +static void xhci_dbc_init_ep_contexts(struct xhci_dbc *dbc) +{ + struct xhci_ep_ctx *ep_ctx; + unsigned int max_burst; + dma_addr_t deq; + + max_burst = DBC_CTRL_MAXBURST(readl(&dbc->regs->control)); + + /* Populate bulk out endpoint context: */ + ep_ctx = dbc_bulkout_ctx(dbc); + deq = dbc_bulkout_enq(dbc); + ep_ctx->ep_info = 0; + ep_ctx->ep_info2 = dbc_epctx_info2(BULK_OUT_EP, 1024, max_burst); + ep_ctx->deq = cpu_to_le64(deq | dbc->ring_out->cycle_state); + + /* Populate bulk in endpoint context: */ + ep_ctx = dbc_bulkin_ctx(dbc); + deq = dbc_bulkin_enq(dbc); + ep_ctx->ep_info = 0; + ep_ctx->ep_info2 = dbc_epctx_info2(BULK_IN_EP, 1024, max_burst); + ep_ctx->deq = cpu_to_le64(deq | dbc->ring_in->cycle_state); +} + static void xhci_dbc_init_contexts(struct xhci_dbc *dbc, u32 string_length) { struct dbc_info_context *info; - struct xhci_ep_ctx *ep_ctx; u32 dev_info; - dma_addr_t deq, dma; - unsigned int max_burst; + dma_addr_t dma; if (!dbc) return; @@ -121,20 +142,8 @@ static void xhci_dbc_init_contexts(struct xhci_dbc *dbc, u32 string_length) info->serial = cpu_to_le64(dma + DBC_MAX_STRING_LENGTH * 3); info->length = cpu_to_le32(string_length); - /* Populate bulk out endpoint context: */ - ep_ctx = dbc_bulkout_ctx(dbc); - max_burst = DBC_CTRL_MAXBURST(readl(&dbc->regs->control)); - deq = dbc_bulkout_enq(dbc); - ep_ctx->ep_info = 0; - ep_ctx->ep_info2 = dbc_epctx_info2(BULK_OUT_EP, 1024, max_burst); - ep_ctx->deq = cpu_to_le64(deq | dbc->ring_out->cycle_state); - - /* Populate bulk in endpoint context: */ - ep_ctx = dbc_bulkin_ctx(dbc); - deq = dbc_bulkin_enq(dbc); - ep_ctx->ep_info = 0; - ep_ctx->ep_info2 = dbc_epctx_info2(BULK_IN_EP, 1024, max_burst); - ep_ctx->deq = cpu_to_le64(deq | dbc->ring_in->cycle_state); + /* Populate bulk in and out endpoint contexts: */ + xhci_dbc_init_ep_contexts(dbc); /* Set DbC context and info registers: */ lo_hi_writeq(dbc->ctx->dma, &dbc->regs->dccp); @@ -436,6 +445,23 @@ dbc_alloc_ctx(struct device *dev, gfp_t flags) return ctx; } +static void xhci_dbc_ring_init(struct xhci_ring *ring) +{ + struct xhci_segment *seg = ring->first_seg; + + /* clear all trbs on ring in case of old ring */ + memset(seg->trbs, 0, TRB_SEGMENT_SIZE); + + /* Only event ring does not use link TRB */ + if (ring->type != TYPE_EVENT) { + union xhci_trb *trb = &seg->trbs[TRBS_PER_SEGMENT - 1]; + + trb->link.segment_ptr = cpu_to_le64(ring->first_seg->dma); + trb->link.control = cpu_to_le32(LINK_TOGGLE | TRB_TYPE(TRB_LINK)); + } + xhci_initialize_ring_info(ring); +} + static struct xhci_ring * xhci_dbc_ring_alloc(struct device *dev, enum xhci_ring_type type, gfp_t flags) { @@ -464,15 +490,10 @@ xhci_dbc_ring_alloc(struct device *dev, enum xhci_ring_type type, gfp_t flags) seg->dma = dma; - /* Only event ring does not use link TRB */ - if (type != TYPE_EVENT) { - union xhci_trb *trb = &seg->trbs[TRBS_PER_SEGMENT - 1]; - - trb->link.segment_ptr = cpu_to_le64(dma); - trb->link.control = cpu_to_le32(LINK_TOGGLE | TRB_TYPE(TRB_LINK)); - } INIT_LIST_HEAD(&ring->td_list); - xhci_initialize_ring_info(ring); + + xhci_dbc_ring_init(ring); + return ring; dma_fail: kfree(seg); From a5c98e8b1398534ae1feb6e95e2d3ee5215538ed Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 2 Sep 2025 13:53:05 +0300 Subject: [PATCH 126/450] xhci: dbc: Fix full DbC transfer ring after several reconnects Pending requests will be flushed on disconnect, and the corresponding TRBs will be turned into No-op TRBs, which are ignored by the xHC controller once it starts processing the ring. If the USB debug cable repeatedly disconnects before ring is started then the ring will eventually be filled with No-op TRBs. No new transfers can be queued when the ring is full, and driver will print the following error message: "xhci_hcd 0000:00:14.0: failed to queue trbs" This is a normal case for 'in' transfers where TRBs are always enqueued in advance, ready to take on incoming data. If no data arrives, and device is disconnected, then ring dequeue will remain at beginning of the ring while enqueue points to first free TRB after last cancelled No-op TRB. s Solve this by reinitializing the rings when the debug cable disconnects and DbC is leaving the configured state. Clear the whole ring buffer and set enqueue and dequeue to the beginning of ring, and set cycle bit to its initial state. Cc: stable@vger.kernel.org Fixes: dfba2174dc42 ("usb: xhci: Add DbC support in xHCI driver") Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250902105306.877476-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-dbgcap.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/xhci-dbgcap.c b/drivers/usb/host/xhci-dbgcap.c index d0faff233e3e..63edf2d8f245 100644 --- a/drivers/usb/host/xhci-dbgcap.c +++ b/drivers/usb/host/xhci-dbgcap.c @@ -462,6 +462,25 @@ static void xhci_dbc_ring_init(struct xhci_ring *ring) xhci_initialize_ring_info(ring); } +static int xhci_dbc_reinit_ep_rings(struct xhci_dbc *dbc) +{ + struct xhci_ring *in_ring = dbc->eps[BULK_IN].ring; + struct xhci_ring *out_ring = dbc->eps[BULK_OUT].ring; + + if (!in_ring || !out_ring || !dbc->ctx) { + dev_warn(dbc->dev, "Can't re-init unallocated endpoints\n"); + return -ENODEV; + } + + xhci_dbc_ring_init(in_ring); + xhci_dbc_ring_init(out_ring); + + /* set ep context enqueue, dequeue, and cycle to initial values */ + xhci_dbc_init_ep_contexts(dbc); + + return 0; +} + static struct xhci_ring * xhci_dbc_ring_alloc(struct device *dev, enum xhci_ring_type type, gfp_t flags) { @@ -885,7 +904,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc) dev_info(dbc->dev, "DbC cable unplugged\n"); dbc->state = DS_ENABLED; xhci_dbc_flush_requests(dbc); - + xhci_dbc_reinit_ep_rings(dbc); return EVT_DISC; } @@ -895,7 +914,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc) writel(portsc, &dbc->regs->portsc); dbc->state = DS_ENABLED; xhci_dbc_flush_requests(dbc); - + xhci_dbc_reinit_ep_rings(dbc); return EVT_DISC; } From edcbe06453ddfde21f6aa763f7cab655f26133cc Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 2 Sep 2025 13:53:06 +0300 Subject: [PATCH 127/450] xhci: fix memory leak regression when freeing xhci vdev devices depth first Suspend-resume cycle test revealed a memory leak in 6.17-rc3 Turns out the slot_id race fix changes accidentally ends up calling xhci_free_virt_device() with an incorrect vdev parameter. The vdev variable was reused for temporary purposes right before calling xhci_free_virt_device(). Fix this by passing the correct vdev parameter. The slot_id race fix that caused this regression was targeted for stable, so this needs to be applied there as well. Fixes: 2eb03376151b ("usb: xhci: Fix slot_id resource race conflict") Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/linux-usb/20250829181354.4450-1-00107082@163.com Suggested-by: Michal Pecio Suggested-by: David Wang <00107082@163.com> Cc: stable@vger.kernel.org Tested-by: David Wang <00107082@163.com> Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250902105306.877476-4-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 81eaad87a3d9..c4a6544aa107 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -962,7 +962,7 @@ static void xhci_free_virt_devices_depth_first(struct xhci_hcd *xhci, int slot_i out: /* we are now at a leaf device */ xhci_debugfs_remove_slot(xhci, slot_id); - xhci_free_virt_device(xhci, vdev, slot_id); + xhci_free_virt_device(xhci, xhci->devs[slot_id], slot_id); } int xhci_alloc_virt_device(struct xhci_hcd *xhci, int slot_id, From 8d63c83d8eb922f6c316320f50c82fa88d099bea Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 25 Aug 2025 12:00:22 -0400 Subject: [PATCH 128/450] USB: gadget: dummy-hcd: Fix locking bug in RT-enabled kernels Yunseong Kim and the syzbot fuzzer both reported a problem in RT-enabled kernels caused by the way dummy-hcd mixes interrupt management and spin-locking. The pattern was: local_irq_save(flags); spin_lock(&dum->lock); ... spin_unlock(&dum->lock); ... // calls usb_gadget_giveback_request() local_irq_restore(flags); The code was written this way because usb_gadget_giveback_request() needs to be called with interrupts disabled and the private lock not held. While this pattern works fine in non-RT kernels, it's not good when RT is enabled. RT kernels handle spinlocks much like mutexes; in particular, spin_lock() may sleep. But sleeping is not allowed while local interrupts are disabled. To fix the problem, rewrite the code to conform to the pattern used elsewhere in dummy-hcd and other UDC drivers: spin_lock_irqsave(&dum->lock, flags); ... spin_unlock(&dum->lock); usb_gadget_giveback_request(...); spin_lock(&dum->lock); ... spin_unlock_irqrestore(&dum->lock, flags); This approach satisfies the RT requirements. Signed-off-by: Alan Stern Cc: stable Fixes: b4dbda1a22d2 ("USB: dummy-hcd: disable interrupts during req->complete") Reported-by: Yunseong Kim Closes: Reported-by: syzbot+8baacc4139f12fa77909@syzkaller.appspotmail.com Closes: Tested-by: syzbot+8baacc4139f12fa77909@syzkaller.appspotmail.com CC: Sebastian Andrzej Siewior CC: stable@vger.kernel.org Reviewed-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/bb192ae2-4eee-48ee-981f-3efdbbd0d8f0@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index 21dbfb0b3bac..1cefca660773 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -765,8 +765,7 @@ static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req) if (!dum->driver) return -ESHUTDOWN; - local_irq_save(flags); - spin_lock(&dum->lock); + spin_lock_irqsave(&dum->lock, flags); list_for_each_entry(iter, &ep->queue, queue) { if (&iter->req != _req) continue; @@ -776,15 +775,16 @@ static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req) retval = 0; break; } - spin_unlock(&dum->lock); if (retval == 0) { dev_dbg(udc_dev(dum), "dequeued req %p from %s, len %d buf %p\n", req, _ep->name, _req->length, _req->buf); + spin_unlock(&dum->lock); usb_gadget_giveback_request(_ep, _req); + spin_lock(&dum->lock); } - local_irq_restore(flags); + spin_unlock_irqrestore(&dum->lock, flags); return retval; } From f34bfcc77b18375a87091c289c2eb53c249787b4 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Thu, 21 Aug 2025 20:37:57 +0000 Subject: [PATCH 129/450] usb: typec: tcpm: properly deliver cable vdms to altmode drivers tcpm_handle_vdm_request delivers messages to the partner altmode or the cable altmode depending on the SVDM response type, which is incorrect. The partner or cable should be chosen based on the received message type instead. Also add this filter to ADEV_NOTIFY_USB_AND_QUEUE_VDM, which is used when the Enter Mode command is responded to by a NAK on SOP or SOP' and when the Exit Mode command is responded to by an ACK on SOP. Fixes: 7e7877c55eb1 ("usb: typec: tcpm: add alt mode enter/exit/vdm support for sop'") Cc: stable@vger.kernel.org Signed-off-by: RD Babiera Reviewed-by: Badhri Jagan Sridharan Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250821203759.1720841-2-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 1f6fdfaa34bf..b2a568a5bc9b 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -2426,17 +2426,21 @@ static void tcpm_handle_vdm_request(struct tcpm_port *port, case ADEV_NONE: break; case ADEV_NOTIFY_USB_AND_QUEUE_VDM: - WARN_ON(typec_altmode_notify(adev, TYPEC_STATE_USB, NULL)); - typec_altmode_vdm(adev, p[0], &p[1], cnt); + if (rx_sop_type == TCPC_TX_SOP_PRIME) { + typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt); + } else { + WARN_ON(typec_altmode_notify(adev, TYPEC_STATE_USB, NULL)); + typec_altmode_vdm(adev, p[0], &p[1], cnt); + } break; case ADEV_QUEUE_VDM: - if (response_tx_sop_type == TCPC_TX_SOP_PRIME) + if (rx_sop_type == TCPC_TX_SOP_PRIME) typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt); else typec_altmode_vdm(adev, p[0], &p[1], cnt); break; case ADEV_QUEUE_VDM_SEND_EXIT_MODE_ON_FAIL: - if (response_tx_sop_type == TCPC_TX_SOP_PRIME) { + if (rx_sop_type == TCPC_TX_SOP_PRIME) { if (typec_cable_altmode_vdm(adev, TYPEC_PLUG_SOP_P, p[0], &p[1], cnt)) { int svdm_version = typec_get_cable_svdm_version( From 21d8525d2e061cde034277d518411b02eac764e2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 4 Sep 2025 17:39:24 +0200 Subject: [PATCH 130/450] usb: gadget: midi2: Fix missing UMP group attributes initialization The gadget card driver forgot to call snd_ump_update_group_attrs() after adding FBs, and this leaves the UMP group attributes uninitialized. As a result, -ENODEV error is returned at opening a legacy rawmidi device as an inactive group. This patch adds the missing call to address the behavior above. Fixes: 8b645922b223 ("usb: gadget: Add support for USB MIDI 2.0 function driver") Cc: stable Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20250904153932.13589-1-tiwai@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_midi2.c b/drivers/usb/gadget/function/f_midi2.c index 0a800ba53816..b351f9ae0fc5 100644 --- a/drivers/usb/gadget/function/f_midi2.c +++ b/drivers/usb/gadget/function/f_midi2.c @@ -1599,6 +1599,7 @@ static int f_midi2_create_card(struct f_midi2 *midi2) strscpy(fb->info.name, ump_fb_name(b), sizeof(fb->info.name)); } + snd_ump_update_group_attrs(ump); } for (i = 0; i < midi2->num_eps; i++) { From 116e79c679a1530cf833d0ff3007061d7a716bd9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 5 Sep 2025 15:32:34 +0200 Subject: [PATCH 131/450] usb: gadget: midi2: Fix MIDI2 IN EP max packet size The EP-IN of MIDI2 (altset 1) wasn't initialized in f_midi2_create_usb_configs() as it's an INT EP unlike others BULK EPs. But this leaves rather the max packet size unchanged no matter which speed is used, resulting in the very slow access. And the wMaxPacketSize values set there look legit for INT EPs, so let's initialize the MIDI2 EP-IN there for achieving the equivalent speed as well. Fixes: 8b645922b223 ("usb: gadget: Add support for USB MIDI 2.0 function driver") Cc: stable Signed-off-by: Takashi Iwai Link: https://lore.kernel.org/r/20250905133240.20966-1-tiwai@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi2.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_midi2.c b/drivers/usb/gadget/function/f_midi2.c index b351f9ae0fc5..de16b02d857e 100644 --- a/drivers/usb/gadget/function/f_midi2.c +++ b/drivers/usb/gadget/function/f_midi2.c @@ -1737,9 +1737,12 @@ static int f_midi2_create_usb_configs(struct f_midi2 *midi2, case USB_SPEED_HIGH: midi2_midi1_ep_out_desc.wMaxPacketSize = cpu_to_le16(512); midi2_midi1_ep_in_desc.wMaxPacketSize = cpu_to_le16(512); - for (i = 0; i < midi2->num_eps; i++) + for (i = 0; i < midi2->num_eps; i++) { midi2_midi2_ep_out_desc[i].wMaxPacketSize = cpu_to_le16(512); + midi2_midi2_ep_in_desc[i].wMaxPacketSize = + cpu_to_le16(512); + } fallthrough; case USB_SPEED_FULL: midi1_in_eps = midi2_midi1_ep_in_descs; @@ -1748,9 +1751,12 @@ static int f_midi2_create_usb_configs(struct f_midi2 *midi2, case USB_SPEED_SUPER: midi2_midi1_ep_out_desc.wMaxPacketSize = cpu_to_le16(1024); midi2_midi1_ep_in_desc.wMaxPacketSize = cpu_to_le16(1024); - for (i = 0; i < midi2->num_eps; i++) + for (i = 0; i < midi2->num_eps; i++) { midi2_midi2_ep_out_desc[i].wMaxPacketSize = cpu_to_le16(1024); + midi2_midi2_ep_in_desc[i].wMaxPacketSize = + cpu_to_le16(1024); + } midi1_in_eps = midi2_midi1_ep_in_ss_descs; midi1_out_eps = midi2_midi1_ep_out_ss_descs; break; From b5e3277c0f1c3439dd02b58997c06201d0ee8dbf Mon Sep 17 00:00:00 2001 From: Harshit Shah Date: Tue, 2 Sep 2025 12:16:29 -0700 Subject: [PATCH 132/450] serial: xilinx_uartps: read reg size from DTS Current implementation uses `CDNS_UART_REGISTER_SPACE(0x1000)` for request_mem_region() and ioremap() in cdns_uart_request_port() API. The cadence/xilinx IP has register space defined from offset 0x0 to 0x48. It also mentions that the register map is defined as [6:0]. So, the upper region may/maynot be used based on the IP integration. In Axiado AX3000 SoC two UART instances are defined 0x100 apart. That is creating issue in some other instance due to overlap with addresses. Since, this address space is already being defined in the devicetree, use the same when requesting the register space. Fixes: 1f7055779001 ("arm64: dts: axiado: Add initial support for AX3000 SoC and eval board") Acked-by: Michal Simek Signed-off-by: Harshit Shah Link: https://lore.kernel.org/r/20250902-xilinx-uartps-reg-size-v3-1-d11cfa7258e3@axiado.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/xilinx_uartps.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index fe457bf1e15b..a66b44d21fba 100644 --- a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -33,7 +33,6 @@ #define CDNS_UART_MINOR 0 /* works best with devtmpfs */ #define CDNS_UART_NR_PORTS 16 #define CDNS_UART_FIFO_SIZE 64 /* FIFO size */ -#define CDNS_UART_REGISTER_SPACE 0x1000 #define TX_TIMEOUT 500000 /* Rx Trigger level */ @@ -1098,15 +1097,15 @@ static int cdns_uart_verify_port(struct uart_port *port, */ static int cdns_uart_request_port(struct uart_port *port) { - if (!request_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE, + if (!request_mem_region(port->mapbase, port->mapsize, CDNS_UART_NAME)) { return -ENOMEM; } - port->membase = ioremap(port->mapbase, CDNS_UART_REGISTER_SPACE); + port->membase = ioremap(port->mapbase, port->mapsize); if (!port->membase) { dev_err(port->dev, "Unable to map registers\n"); - release_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE); + release_mem_region(port->mapbase, port->mapsize); return -ENOMEM; } return 0; @@ -1121,7 +1120,7 @@ static int cdns_uart_request_port(struct uart_port *port) */ static void cdns_uart_release_port(struct uart_port *port) { - release_mem_region(port->mapbase, CDNS_UART_REGISTER_SPACE); + release_mem_region(port->mapbase, port->mapsize); iounmap(port->membase); port->membase = NULL; } @@ -1780,6 +1779,7 @@ static int cdns_uart_probe(struct platform_device *pdev) * and triggers invocation of the config_port() entry point. */ port->mapbase = res->start; + port->mapsize = resource_size(res); port->irq = irq; port->dev = &pdev->dev; port->uartclk = clk_get_rate(cdns_uart_data->uartclk); From ab1396af7595e7d49a3850481b24d7fe7cbdfd31 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 5 Sep 2025 22:06:18 -0700 Subject: [PATCH 133/450] trace/fgraph: Fix error handling Commit edede7a6dcd7 ("trace/fgraph: Fix the warning caused by missing unregister notifier") added a call to unregister the PM notifier if register_ftrace_graph() failed. It does so unconditionally. However, the PM notifier is only registered with the first call to register_ftrace_graph(). If the first registration was successful and a subsequent registration failed, the notifier is now unregistered even if ftrace graphs are still registered. Fix the problem by only unregistering the PM notifier during error handling if there are no active fgraph registrations. Fixes: edede7a6dcd7 ("trace/fgraph: Fix the warning caused by missing unregister notifier") Closes: https://lore.kernel.org/all/63b0ba5a-a928-438e-84f9-93028dd72e54@roeck-us.net/ Cc: Ye Weihua Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250906050618.2634078-1-linux@roeck-us.net Signed-off-by: Guenter Roeck Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 2a42c1036ea8..1e3b32b1e82c 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1397,7 +1397,8 @@ int register_ftrace_graph(struct fgraph_ops *gops) ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); - unregister_pm_notifier(&ftrace_suspend_notifier); + if (!ftrace_graph_active) + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } From c1628c00c4351dd0727ef7f670694f68d9e663d8 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Sat, 6 Sep 2025 11:56:10 +0800 Subject: [PATCH 134/450] tracing/osnoise: Fix null-ptr-deref in bitmap_parselist() A crash was observed with the following output: BUG: kernel NULL pointer dereference, address: 0000000000000010 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 2 UID: 0 PID: 92 Comm: osnoise_cpus Not tainted 6.17.0-rc4-00201-gd69eb204c255 #138 PREEMPT(voluntary) RIP: 0010:bitmap_parselist+0x53/0x3e0 Call Trace: osnoise_cpus_write+0x7a/0x190 vfs_write+0xf8/0x410 ? do_sys_openat2+0x88/0xd0 ksys_write+0x60/0xd0 do_syscall_64+0xa4/0x260 entry_SYSCALL_64_after_hwframe+0x77/0x7f This issue can be reproduced by below code: fd=open("/sys/kernel/debug/tracing/osnoise/cpus", O_WRONLY); write(fd, "0-2", 0); When user pass 'count=0' to osnoise_cpus_write(), kmalloc() will return ZERO_SIZE_PTR (16) and cpulist_parse() treat it as a normal value, which trigger the null pointer dereference. Add check for the parameter 'count'. Cc: Cc: Cc: Link: https://lore.kernel.org/20250906035610.3880282-1-wangliang74@huawei.com Fixes: 17f89102fe23 ("tracing/osnoise: Allow arbitrarily long CPU string") Signed-off-by: Wang Liang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index fd259da0aa64..337bc0eb5d71 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2322,6 +2322,9 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, int running, err; char *buf __free(kfree) = NULL; + if (count < 1) + return 0; + buf = kmalloc(count, GFP_KERNEL); if (!buf) return -ENOMEM; From 5f9efb6b7667043527d377421af2070cc0aa2ecd Mon Sep 17 00:00:00 2001 From: Julien Massot Date: Fri, 5 Sep 2025 13:51:58 +0200 Subject: [PATCH 135/450] Input: mtk-pmic-keys - MT6359 has a specific release irq Support for MT6359 PMIC keys has been added recently. However, the key release event is not properly handled: only key press events are generated, leaving key states stuck in "pressed". This patch ensures that both key press and key release events are properly emitted by handling the release logic correctly. Introduce a 'key_release_irq' member to the 'mtk_pmic_regs' to identify the devices that have a separate irq for the release event. Fixes: bc25e6bf032e ("Input: mtk-pmic-keys - add support for MT6359 PMIC keys") Signed-off-by: Julien Massot Link: https://lore.kernel.org/r/20250905-radxa-nio-12-l-gpio-v3-1-40f11377fb55@collabora.com Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/mtk-pmic-keys.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/input/keyboard/mtk-pmic-keys.c b/drivers/input/keyboard/mtk-pmic-keys.c index 50e2e792c91d..c78d9f6d97c4 100644 --- a/drivers/input/keyboard/mtk-pmic-keys.c +++ b/drivers/input/keyboard/mtk-pmic-keys.c @@ -55,6 +55,7 @@ struct mtk_pmic_regs { const struct mtk_pmic_keys_regs keys_regs[MTK_PMIC_MAX_KEY_COUNT]; u32 pmic_rst_reg; u32 rst_lprst_mask; /* Long-press reset timeout bitmask */ + bool key_release_irq; }; static const struct mtk_pmic_regs mt6397_regs = { @@ -116,6 +117,7 @@ static const struct mtk_pmic_regs mt6358_regs = { MTK_PMIC_HOMEKEY_RST), .pmic_rst_reg = MT6358_TOP_RST_MISC, .rst_lprst_mask = MTK_PMIC_RST_DU_MASK, + .key_release_irq = true, }; static const struct mtk_pmic_regs mt6359_regs = { @@ -129,6 +131,7 @@ static const struct mtk_pmic_regs mt6359_regs = { MTK_PMIC_HOMEKEY_RST), .pmic_rst_reg = MT6359_TOP_RST_MISC, .rst_lprst_mask = MTK_PMIC_RST_DU_MASK, + .key_release_irq = true, }; struct mtk_pmic_keys_info { @@ -368,7 +371,7 @@ static int mtk_pmic_keys_probe(struct platform_device *pdev) if (keys->keys[index].irq < 0) return keys->keys[index].irq; - if (of_device_is_compatible(node, "mediatek,mt6358-keys")) { + if (mtk_pmic_regs->key_release_irq) { keys->keys[index].irq_r = platform_get_irq_byname(pdev, irqnames_r[index]); From 3c9ba2777d6c86025e1ba4186dc5cd930e40ec5f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 22 Aug 2025 07:07:14 +0000 Subject: [PATCH 136/450] kernfs: Fix UAF in polling when open file is released A use-after-free (UAF) vulnerability was identified in the PSI (Pressure Stall Information) monitoring mechanism: BUG: KASAN: slab-use-after-free in psi_trigger_poll+0x3c/0x140 Read of size 8 at addr ffff3de3d50bd308 by task systemd/1 psi_trigger_poll+0x3c/0x140 cgroup_pressure_poll+0x70/0xa0 cgroup_file_poll+0x8c/0x100 kernfs_fop_poll+0x11c/0x1c0 ep_item_poll.isra.0+0x188/0x2c0 Allocated by task 1: cgroup_file_open+0x88/0x388 kernfs_fop_open+0x73c/0xaf0 do_dentry_open+0x5fc/0x1200 vfs_open+0xa0/0x3f0 do_open+0x7e8/0xd08 path_openat+0x2fc/0x6b0 do_filp_open+0x174/0x368 Freed by task 8462: cgroup_file_release+0x130/0x1f8 kernfs_drain_open_files+0x17c/0x440 kernfs_drain+0x2dc/0x360 kernfs_show+0x1b8/0x288 cgroup_file_show+0x150/0x268 cgroup_pressure_write+0x1dc/0x340 cgroup_file_write+0x274/0x548 Reproduction Steps: 1. Open test/cpu.pressure and establish epoll monitoring 2. Disable monitoring: echo 0 > test/cgroup.pressure 3. Re-enable monitoring: echo 1 > test/cgroup.pressure The race condition occurs because: 1. When cgroup.pressure is disabled (echo 0 > cgroup.pressure), it: - Releases PSI triggers via cgroup_file_release() - Frees of->priv through kernfs_drain_open_files() 2. While epoll still holds reference to the file and continues polling 3. Re-enabling (echo 1 > cgroup.pressure) accesses freed of->priv epolling disable/enable cgroup.pressure fd=open(cpu.pressure) while(1) ... epoll_wait kernfs_fop_poll kernfs_get_active = true echo 0 > cgroup.pressure ... cgroup_file_show kernfs_show // inactive kn kernfs_drain_open_files cft->release(of); kfree(ctx); ... kernfs_get_active = false echo 1 > cgroup.pressure kernfs_show kernfs_activate_one(kn); kernfs_fop_poll kernfs_get_active = true cgroup_file_poll psi_trigger_poll // UAF ... end: close(fd) To address this issue, introduce kernfs_get_active_of() for kernfs open files to obtain active references. This function will fail if the open file has been released. Replace kernfs_get_active() with kernfs_get_active_of() to prevent further operations on released file descriptors. Fixes: 34f26a15611a ("sched/psi: Per-cgroup PSI accounting disable/re-enable interface") Cc: stable Reported-by: Zhang Zhaotian Signed-off-by: Chen Ridong Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250822070715.1565236-2-chenridong@huaweicloud.com Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 58 +++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index a6c692cac616..9adf36e6364b 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -70,6 +70,24 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) !list_empty(&of->list)); } +/* Get active reference to kernfs node for an open file */ +static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of) +{ + /* Skip if file was already released */ + if (unlikely(of->released)) + return NULL; + + if (!kernfs_get_active(of->kn)) + return NULL; + + return of; +} + +static void kernfs_put_active_of(struct kernfs_open_file *of) +{ + return kernfs_put_active(of->kn); +} + /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * @@ -139,7 +157,7 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) if (ops->seq_stop) ops->seq_stop(sf, v); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) @@ -152,7 +170,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); @@ -238,7 +256,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; @@ -252,7 +270,7 @@ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) else len = -EINVAL; - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len < 0) @@ -323,7 +341,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; @@ -335,7 +353,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) else len = -EINVAL; - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len > 0) @@ -357,13 +375,13 @@ static void kernfs_vma_open(struct vm_area_struct *vma) if (!of->vm_ops) return; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return; if (of->vm_ops->open) of->vm_ops->open(vma); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) @@ -375,14 +393,14 @@ static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -395,7 +413,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) if (!of->vm_ops) return VM_FAULT_SIGBUS; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = 0; @@ -404,7 +422,7 @@ static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) else file_update_time(file); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -418,14 +436,14 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, if (!of->vm_ops) return -EINVAL; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); return ret; } @@ -455,7 +473,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) mutex_lock(&of->mutex); rc = -ENODEV; - if (!kernfs_get_active(of->kn)) + if (!kernfs_get_active_of(of)) goto out_unlock; ops = kernfs_ops(of->kn); @@ -490,7 +508,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) } vma->vm_ops = &kernfs_vm_ops; out_put: - kernfs_put_active(of->kn); + kernfs_put_active_of(of); out_unlock: mutex_unlock(&of->mutex); @@ -852,7 +870,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; - if (!kernfs_get_active(kn)) + if (!kernfs_get_active_of(of)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) @@ -860,7 +878,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) else ret = kernfs_generic_poll(of, wait); - kernfs_put_active(kn); + kernfs_put_active_of(of); return ret; } @@ -875,7 +893,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); - if (!kernfs_get_active(of->kn)) { + if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); return -ENODEV; } @@ -886,7 +904,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) else ret = generic_file_llseek(file, offset, whence); - kernfs_put_active(of->kn); + kernfs_put_active_of(of); mutex_unlock(&of->mutex); return ret; } From d3684397ea9ba2edf02be0aa2b4dcab3bd74c503 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Aug 2025 19:29:55 -0400 Subject: [PATCH 137/450] nfs/localio: avoid bouncing LOCALIO if nfs_client_is_local() Previously nfs_local_probe() was made to disable and then attempt to re-enable LOCALIO (via LOCALIO protocol handshake) if/when it was called and LOCALIO already enabled. Vague memory for _why_ this was the case is that this was useful if/when a local NFS server were to be restarted with a local NFS client connected to it. But as it happens this causes an absurd amount of LOCALIO flapping which has a side-effect of too much IO being needlessly sent to NFSD (using RPC over the loopback network interface). This is the definition of "serious performance loss" (that negates the point of having LOCALIO). So remove this mis-optimization for re-enabling LOCALIO if/when an NFS server is restarted (which is an extremely rare thing to do). Will revisit testing that scenario again but in the meantime this patch restores the full benefit of LOCALIO. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index bdb82a19136a..97abf62f109d 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -180,10 +180,8 @@ static void nfs_local_probe(struct nfs_client *clp) return; } - if (nfs_client_is_local(clp)) { - /* If already enabled, disable and re-enable */ - nfs_localio_disable_client(clp); - } + if (nfs_client_is_local(clp)) + return; if (!nfs_uuid_begin(&clp->cl_uuid)) return; @@ -244,7 +242,8 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, case -ENOMEM: case -ENXIO: case -ENOENT: - /* Revalidate localio, will disable if unsupported */ + /* Revalidate localio */ + nfs_localio_disable_client(clp); nfs_local_probe(clp); } } From 5a46d2339a5ae268ede53a221f20433d8ea4f2f9 Mon Sep 17 00:00:00 2001 From: Tigran Mkrtchyan Date: Thu, 28 Aug 2025 16:51:00 +0200 Subject: [PATCH 138/450] flexfiles/pNFS: fix NULL checks on result of ff_layout_choose_ds_for_read Recent commit f06bedfa62d5 ("pNFS/flexfiles: don't attempt pnfs on fatal DS errors") has changed the error return type of ff_layout_choose_ds_for_read() from NULL to an error pointer. However, not all code paths have been updated to match the change. Thus, some non-NULL checks will accept error pointers as a valid return value. Reported-by: Dan Carpenter Suggested-by: Dan Carpenter Fixes: f06bedfa62d5 ("pNFS/flexfiles: don't attempt pnfs on fatal DS errors") Signed-off-by: Tigran Mkrtchyan Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 8dc921d83538..f8ab7b4e09e7 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -773,8 +773,11 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, continue; if (check_device && - nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) + nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) { + // reinitialize the error state in case if this is the last iteration + ds = ERR_PTR(-EINVAL); continue; + } *best_idx = idx; break; @@ -804,7 +807,7 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, struct nfs4_pnfs_ds *ds; ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); - if (ds) + if (!IS_ERR(ds)) return ds; return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); } @@ -818,7 +821,7 @@ ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, best_idx); - if (ds || !pgio->pg_mirror_idx) + if (!IS_ERR(ds) || !pgio->pg_mirror_idx) return ds; return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); } @@ -868,7 +871,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, req->wb_nio = 0; ds = ff_layout_get_ds_for_read(pgio, &ds_idx); - if (!ds) { + if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; pnfs_generic_pg_cleanup(pgio); @@ -1072,11 +1075,13 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; + struct nfs4_pnfs_ds *ds; - if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx)) - ff_layout_send_layouterror(hdr->lseg); - else + ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx); + if (IS_ERR(ds)) pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); + else + ff_layout_send_layouterror(hdr->lseg); pnfs_read_resend_pnfs(hdr, new_idx); } From b1817b18ff20e69f5accdccefaf78bf5454bede2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Sep 2025 18:46:16 -0400 Subject: [PATCH 139/450] NFS: Protect against 'eof page pollution' This commit fixes the failing xfstest 'generic/363'. When the user mmaps() an area that extends beyond the end of file, and proceeds to write data into the folio that straddles that eof, we're required to discard that folio data if the user calls some function that extends the file length. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 33 +++++++++++++++++++++++++++++++++ fs/nfs/inode.c | 9 +++++++-- fs/nfs/internal.h | 2 ++ fs/nfs/nfs42proc.c | 14 +++++++++++--- fs/nfs/nfstrace.h | 1 + 5 files changed, 54 insertions(+), 5 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 86e36c630f09..a3105f944a0e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -280,6 +281,37 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL_GPL(nfs_file_fsync); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to) +{ + struct folio *folio; + + if (from >= to) + return; + + folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + + if (folio_test_uptodate(folio)) { + loff_t fpos = folio_pos(folio); + size_t offset = from - fpos; + size_t end = folio_size(folio); + + if (to - fpos < end) + end = to - fpos; + folio_zero_segment(folio, offset, end); + trace_nfs_size_truncate_folio(mapping->host, to); + } + + folio_unlock(folio); + folio_put(folio); +} +EXPORT_SYMBOL_GPL(nfs_truncate_last_folio); + /* * Decide whether a read/modify/write cycle may be more efficient * then a modify/write/read cycle when writing to a page in the @@ -356,6 +388,7 @@ static int nfs_write_begin(const struct kiocb *iocb, dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); + nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); fgp |= fgf_set_order(len); start: diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae423..0b141feacc52 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -716,6 +716,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; + loff_t oldsize = i_size_read(inode); int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); @@ -731,7 +732,7 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (error) return error; - if (attr->ia_size == i_size_read(inode)) + if (attr->ia_size == oldsize) attr->ia_valid &= ~ATTR_SIZE; } @@ -777,8 +778,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); - if (error == 0) + if (error == 0) { + if (attr->ia_valid & ATTR_SIZE) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + attr->ia_size); error = nfs_refresh_inode(inode, fattr); + } nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 74d712b58423..1433ae13dba0 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -437,6 +437,8 @@ int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); +void nfs_truncate_last_folio(struct address_space *mapping, loff_t from, + loff_t to); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 01c01f45358b..4420b8740e2f 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -137,6 +137,7 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) @@ -145,7 +146,11 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == -EOPNOTSUPP) + + if (err == 0) + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); + else if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | NFS_CAP_ZERO_RANGE); @@ -183,6 +188,7 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], }; struct inode *inode = file_inode(filep); + loff_t oldsize = i_size_read(inode); int err; if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) @@ -191,9 +197,11 @@ int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); - if (err == 0) + if (err == 0) { + nfs_truncate_last_folio(inode->i_mapping, oldsize, + offset + len); truncate_pagecache_range(inode, offset, (offset + len) -1); - if (err == -EOPNOTSUPP) + } else if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; inode_unlock(inode); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 96b1323318c2..627115179795 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -272,6 +272,7 @@ DECLARE_EVENT_CLASS(nfs_update_size_class, TP_ARGS(inode, new_size)) DEFINE_NFS_UPDATE_SIZE_EVENT(truncate); +DEFINE_NFS_UPDATE_SIZE_EVENT(truncate_folio); DEFINE_NFS_UPDATE_SIZE_EVENT(wcc); DEFINE_NFS_UPDATE_SIZE_EVENT(update); DEFINE_NFS_UPDATE_SIZE_EVENT(grow); From b2036bb65114c01caf4a1afe553026e081703c8c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Sep 2025 10:25:35 -0400 Subject: [PATCH 140/450] NFSv4.2: Protect copy offload and clone against 'eof page pollution' The NFSv4.2 copy offload and clone functions can also end up extending the size of the destination file, so they too need to call nfs_truncate_last_folio(). Reported-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 4420b8740e2f..e2fea37c5348 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -362,22 +362,27 @@ static int process_copy_commit(struct file *dst, loff_t pos_dst, /** * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload - * @inode: pointer to destination inode + * @file: pointer to destination file * @pos: destination offset * @len: copy length + * @oldsize: length of the file prior to clone/copy * * Punch a hole in the inode page cache, so that the NFS client will * know to retrieve new data. * Update the file size if necessary, and then mark the inode as having * invalid cached values for change attribute, ctime, mtime and space used. */ -static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len) +static void nfs42_copy_dest_done(struct file *file, loff_t pos, loff_t len, + loff_t oldsize) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; loff_t newsize = pos + len; loff_t end = newsize - 1; - WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_SHIFT, end >> PAGE_SHIFT)); + nfs_truncate_last_folio(mapping, oldsize, pos); + WARN_ON_ONCE(invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT)); spin_lock(&inode->i_lock); if (newsize > i_size_read(inode)) @@ -410,6 +415,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_server *src_server = NFS_SERVER(src_inode); loff_t pos_src = args->src_pos; loff_t pos_dst = args->dst_pos; + loff_t oldsize_dst = i_size_read(dst_inode); size_t count = args->count; ssize_t status; @@ -483,7 +489,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, goto out; } - nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count); + nfs42_copy_dest_done(dst, pos_dst, res->write_res.count, oldsize_dst); nfs_invalidate_atime(src_inode); status = res->write_res.count; out: @@ -1250,6 +1256,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct nfs42_clone_res res = { .server = server, }; + loff_t oldsize_dst = i_size_read(dst_inode); int status; msg->rpc_argp = &args; @@ -1284,7 +1291,7 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, /* a zero-length count means clone to EOF in src */ if (count == 0 && res.dst_fattr->valid & NFS_ATTR_FATTR_SIZE) count = nfs_size_to_loff_t(res.dst_fattr->size) - dst_offset; - nfs42_copy_dest_done(dst_inode, dst_offset, count); + nfs42_copy_dest_done(dst_f, dst_offset, count, oldsize_dst); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); } From 9eb90f435415c7da4800974ed943e39b5578ee7f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 5 Sep 2025 12:06:23 -0400 Subject: [PATCH 141/450] NFS: Serialise O_DIRECT i/o and truncate() Ensure that all O_DIRECT reads and writes are complete, and prevent the initiation of new i/o until the setattr operation that will truncate the file is complete. Fixes: a5864c999de6 ("NFS: Do not serialise O_DIRECT reads and writes") Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 +++- fs/nfs/internal.h | 10 ++++++++++ fs/nfs/io.c | 13 ++----------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0b141feacc52..49df9debb1a6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -768,8 +768,10 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode)) { + nfs_file_block_o_direct(NFS_I(inode)); nfs_sync_inode(inode); + } fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fattr == NULL) { diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1433ae13dba0..c0a44f389f8f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -532,6 +532,16 @@ static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } +/* Must be called with exclusively locked inode->i_rwsem */ +static inline void nfs_file_block_o_direct(struct nfs_inode *nfsi) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(&nfsi->vfs_inode); + } +} + + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 3388faf2acb9..d275b0a250bf 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -14,15 +14,6 @@ #include "internal.h" -/* Call with exclusively locked inode->i_rwsem */ -static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) -{ - if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { - clear_bit(NFS_INO_ODIRECT, &nfsi->flags); - inode_dio_wait(inode); - } -} - /** * nfs_start_io_read - declare the file is being used for buffered reads * @inode: file inode @@ -57,7 +48,7 @@ nfs_start_io_read(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (err) return err; - nfs_block_o_direct(nfsi, inode); + nfs_file_block_o_direct(nfsi); downgrade_write(&inode->i_rwsem); return 0; @@ -90,7 +81,7 @@ nfs_start_io_write(struct inode *inode) err = down_write_killable(&inode->i_rwsem); if (!err) - nfs_block_o_direct(NFS_I(inode), inode); + nfs_file_block_o_direct(NFS_I(inode)); return err; } From b93128f29733af5d427a335978a19884c2c230e2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 5 Sep 2025 12:11:17 -0400 Subject: [PATCH 142/450] NFSv4.2: Serialise O_DIRECT i/o and fallocate() Ensure that all O_DIRECT reads and writes complete before calling fallocate so that we don't race w.r.t. attribute updates. Fixes: 99f237832243 ("NFSv4.2: Always flush out writes in nfs42_proc_fallocate()") Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index e2fea37c5348..1a169372ca16 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -114,6 +114,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, exception.inode = inode; exception.state = lock->open_context->state; + nfs_file_block_o_direct(NFS_I(inode)); err = nfs_sync_inode(inode); if (err) goto out; From c80ebeba1198eac8811ab0dba36ecc13d51e4438 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Sep 2025 10:40:24 -0400 Subject: [PATCH 143/450] NFSv4.2: Serialise O_DIRECT i/o and clone range Ensure that all O_DIRECT reads and writes complete before cloning a file range, so that both the source and destination are up to date. Fixes: a5864c999de6 ("NFS: Do not serialise O_DIRECT reads and writes") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1d6b5f4230c9..c9a0d1e420c6 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -278,9 +278,11 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, lock_two_nondirectories(src_inode, dst_inode); /* flush all pending writes on both src and dst so that server * has the latest data */ + nfs_file_block_o_direct(NFS_I(src_inode)); ret = nfs_sync_inode(src_inode); if (ret) goto out_unlock; + nfs_file_block_o_direct(NFS_I(dst_inode)); ret = nfs_sync_inode(dst_inode); if (ret) goto out_unlock; From ca247c89900ae90207f4d321e260cd93b7c7d104 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Sep 2025 10:54:13 -0400 Subject: [PATCH 144/450] NFSv4.2: Serialise O_DIRECT i/o and copy range Ensure that all O_DIRECT reads and writes complete before copying a file range, so that the destination is up to date. Fixes: a5864c999de6 ("NFS: Do not serialise O_DIRECT reads and writes") Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 1a169372ca16..6a0b5871ba3b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -445,6 +445,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, return status; } + nfs_file_block_o_direct(NFS_I(dst_inode)); status = nfs_sync_inode(dst_inode); if (status) return status; From b7b8574225e9d2b5f1fb5483886ab797892f43b5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Sep 2025 11:48:57 -0400 Subject: [PATCH 145/450] NFS: nfs_invalidate_folio() must observe the offset and size arguments If we're truncating part of the folio, then we need to write out the data on the part that is not covered by the cancellation. Fixes: d47992f86b30 ("mm: change invalidatepage prototype to accept length") Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 7 ++++--- fs/nfs/write.c | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index a3105f944a0e..8059ece82468 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -475,10 +475,11 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset, dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", folio->index, offset, length); - if (offset != 0 || length < folio_size(folio)) - return; /* Cancel any unstarted writes on this page */ - nfs_wb_folio_cancel(inode, folio); + if (offset != 0 || length < folio_size(folio)) + nfs_wb_folio(inode, folio); + else + nfs_wb_folio_cancel(inode, folio); folio_wait_private_2(folio); /* [DEPRECATED] */ trace_nfs_invalidate_folio(inode, folio_pos(folio) + offset, length); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 8b7c04737967..e359fbcdc8a0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2045,6 +2045,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) * release it */ nfs_inode_remove_request(req); nfs_unlock_and_release_request(req); + folio_cancel_dirty(folio); } return ret; From c12b6a7b12a13ccd3aece6be09345c1944e18d3e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Sep 2025 20:11:03 -0400 Subject: [PATCH 146/450] NFS: Fix the marking of the folio as up to date Since all callers of nfs_page_group_covers_page() have already ensured that there is only one group member, all that is required is to check that the entire folio contains dirty data. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 52 +++++--------------------------------------------- 1 file changed, 5 insertions(+), 47 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e359fbcdc8a0..647c53d1418a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -237,59 +237,17 @@ static void nfs_mapping_set_error(struct folio *folio, int error) } /* - * nfs_page_group_search_locked - * @head - head request of page group - * @page_offset - offset into page + * nfs_page_covers_folio + * @req: struct nfs_page * - * Search page group with head @head to find a request that contains the - * page offset @page_offset. - * - * Returns a pointer to the first matching nfs request, or NULL if no - * match is found. - * - * Must be called with the page group lock held - */ -static struct nfs_page * -nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) -{ - struct nfs_page *req; - - req = head; - do { - if (page_offset >= req->wb_pgbase && - page_offset < (req->wb_pgbase + req->wb_bytes)) - return req; - - req = req->wb_this_page; - } while (req != head); - - return NULL; -} - -/* - * nfs_page_group_covers_page - * @head - head request of page group - * - * Return true if the page group with head @head covers the whole page, - * returns false otherwise + * Return true if the request covers the whole folio. + * Note that the caller should ensure all subrequests have been joined */ static bool nfs_page_group_covers_page(struct nfs_page *req) { unsigned int len = nfs_folio_length(nfs_page_to_folio(req)); - struct nfs_page *tmp; - unsigned int pos = 0; - nfs_page_group_lock(req); - - for (;;) { - tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (!tmp) - break; - pos = tmp->wb_pgbase + tmp->wb_bytes; - } - - nfs_page_group_unlock(req); - return pos >= len; + return req->wb_pgbase == 0 && req->wb_bytes == len; } /* We can set the PG_uptodate flag if we see that a write request From 199cd9e8d14bc14bdbd1fa3031ce26dac9781507 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Sep 2025 09:49:33 -0400 Subject: [PATCH 147/450] Revert "SUNRPC: Don't allow waiting for exiting tasks" This reverts commit 14e41b16e8cb677bb440dca2edba8b041646c742. This patch breaks the LTP acct02 test, so let's revert and look for a better solution. Reported-by: Mark Brown Reported-by: Harshvardhan Jha Link: https://lore.kernel.org/linux-nfs/7d4d57b0-39a3-49f1-8ada-60364743e3b4@sirena.org.uk/ Cc: stable@vger.kernel.org # 6.15.x Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 73bc39281ef5..9b45fbdc90ca 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,8 +276,6 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (unlikely(current->flags & PF_EXITING)) - return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; From 9559d2fffd4f9b892165eed48198a0e5cb8504e6 Mon Sep 17 00:00:00 2001 From: Justin Worrell Date: Thu, 4 Sep 2025 16:09:57 -0500 Subject: [PATCH 148/450] SUNRPC: call xs_sock_process_cmsg for all cmsg xs_sock_recv_cmsg was failing to call xs_sock_process_cmsg for any cmsg type other than TLS_RECORD_TYPE_ALERT (TLS_RECORD_TYPE_DATA, and other values not handled.) Based on my reading of the previous commit (cc5d5908: sunrpc: fix client side handling of tls alerts), it looks like only iov_iter_revert should be conditional on TLS_RECORD_TYPE_ALERT (but that other cmsg types should still call xs_sock_process_cmsg). On my machine, I was unable to connect (over mtls) to an NFS share hosted on FreeBSD. With this patch applied, I am able to mount the share again. Fixes: cc5d59081fa2 ("sunrpc: fix client side handling of tls alerts") Signed-off-by: Justin Worrell Reviewed-and-tested-by: Scott Mayhew Link: https://lore.kernel.org/r/20250904211038.12874-3-jworrell@gmail.com Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c5f7bbf5775f..3aa987e7f072 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -407,9 +407,9 @@ xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, alert_kvec.iov_len); ret = sock_recvmsg(sock, &msg, flags); - if (ret > 0 && - tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { - iov_iter_revert(&msg.msg_iter, ret); + if (ret > 0) { + if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) + iov_iter_revert(&msg.msg_iter, ret); ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, -EAGAIN); } From 71d2893a235bf3b95baccead27b3d47f2f2cdc4c Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Mon, 8 Sep 2025 06:27:27 +0800 Subject: [PATCH 149/450] ALSA: hda/tas2781: Fix the order of TAS2781 calibrated-data A bug reported by one of my customers that the order of TAS2781 calibrated-data is incorrect, the correct way is to move R0_Low and insert it between R0 and InvR0. Fixes: 4fe238513407 ("ALSA: hda/tas2781: Move and unified the calibrated-data getting function for SPI and I2C into the tas2781_hda lib") Signed-off-by: Shenghao Ding Link: https://patch.msgid.link/20250907222728.988-1-shenghao-ding@ti.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/side-codecs/tas2781_hda.c | 25 +++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/sound/hda/codecs/side-codecs/tas2781_hda.c b/sound/hda/codecs/side-codecs/tas2781_hda.c index f46d2e06c64f..536940c78f00 100644 --- a/sound/hda/codecs/side-codecs/tas2781_hda.c +++ b/sound/hda/codecs/side-codecs/tas2781_hda.c @@ -33,6 +33,23 @@ const efi_guid_t tasdev_fct_efi_guid[] = { }; EXPORT_SYMBOL_NS_GPL(tasdev_fct_efi_guid, "SND_HDA_SCODEC_TAS2781"); +/* + * The order of calibrated-data writing function is a bit different from the + * order in UEFI. Here is the conversion to match the order of calibrated-data + * writing function. + */ +static void cali_cnv(unsigned char *data, unsigned int base, int offset) +{ + struct cali_reg reg_data; + + memcpy(®_data, &data[base], sizeof(reg_data)); + /* the data order has to be swapped between r0_low_reg and inv0_reg */ + swap(reg_data.r0_low_reg, reg_data.invr0_reg); + + cpu_to_be32_array((__force __be32 *)(data + offset + 1), + (u32 *)®_data, TASDEV_CALIB_N); +} + static void tas2781_apply_calib(struct tasdevice_priv *p) { struct calidata *cali_data = &p->cali_data; @@ -103,8 +120,7 @@ static void tas2781_apply_calib(struct tasdevice_priv *p) data[l] = k; oft++; - for (i = 0; i < TASDEV_CALIB_N * 4; i++) - data[l + i + 1] = data[4 * oft + i]; + cali_cnv(data, 4 * oft, l); k++; } } @@ -130,9 +146,8 @@ static void tas2781_apply_calib(struct tasdevice_priv *p) for (j = p->ndev - 1; j >= 0; j--) { l = j * (cali_data->cali_dat_sz_per_dev + 1); - for (i = TASDEV_CALIB_N * 4; i > 0 ; i--) - data[l + i] = data[p->index * 5 + i]; - data[l+i] = j; + cali_cnv(data, cali_data->cali_dat_sz_per_dev * j, l); + data[l] = j; } } From cba4262a19afae21665ee242b3404bcede5a94d7 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 1 Sep 2025 17:04:15 +0000 Subject: [PATCH 150/450] x86/cpu/topology: Always try cpu_parse_topology_ext() on AMD/Hygon Support for parsing the topology on AMD/Hygon processors using CPUID leaf 0xb was added in 3986a0a805e6 ("x86/CPU/AMD: Derive CPU topology from CPUID function 0xB when available"). In an effort to keep all the topology parsing bits in one place, this commit also introduced a pseudo dependency on the TOPOEXT feature to parse the CPUID leaf 0xb. The TOPOEXT feature (CPUID 0x80000001 ECX[22]) advertises the support for Cache Properties leaf 0x8000001d and the CPUID leaf 0x8000001e EAX for "Extended APIC ID" however support for 0xb was introduced alongside the x2APIC support not only on AMD [1], but also historically on x86 [2]. Similar to 0xb, the support for extended CPU topology leaf 0x80000026 too does not depend on the TOPOEXT feature. The support for these leaves is expected to be confirmed by ensuring leaf <= {extended_}cpuid_level and then parsing the level 0 of the respective leaf to confirm EBX[15:0] (LogProcAtThisLevel) is non-zero as stated in the definition of "CPUID_Fn0000000B_EAX_x00 [Extended Topology Enumeration] (Core::X86::Cpuid::ExtTopEnumEax0)" in Processor Programming Reference (PPR) for AMD Family 19h Model 01h Rev B1 Vol1 [3] Sec. 2.1.15.1 "CPUID Instruction Functions". This has not been a problem on baremetal platforms since support for TOPOEXT (Fam 0x15 and later) predates the support for CPUID leaf 0xb (Fam 0x17[Zen2] and later), however, for AMD guests on QEMU, the "x2apic" feature can be enabled independent of the "topoext" feature where QEMU expects topology and the initial APICID to be parsed using the CPUID leaf 0xb (especially when number of cores > 255) which is populated independent of the "topoext" feature flag. Unconditionally call cpu_parse_topology_ext() on AMD and Hygon processors to first parse the topology using the XTOPOLOGY leaves (0x80000026 / 0xb) before using the TOPOEXT leaf (0x8000001e). While at it, break down the single large comment in parse_topology_amd() to better highlight the purpose of each CPUID leaf. Fixes: 3986a0a805e6 ("x86/CPU/AMD: Derive CPU topology from CPUID function 0xB when available") Suggested-by: Naveen N Rao (AMD) Signed-off-by: K Prateek Nayak Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org # Only v6.9 and above; depends on x86 topology rewrite Link: https://lore.kernel.org/lkml/1529686927-7665-1-git-send-email-suravee.suthikulpanit@amd.com/ [1] Link: https://lore.kernel.org/lkml/20080818181435.523309000@linux-os.sc.intel.com/ [2] Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 [3] --- arch/x86/kernel/cpu/topology_amd.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 827dd0dbb6e9..c79ebbb639cb 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -175,27 +175,30 @@ static void topoext_fixup(struct topo_scan *tscan) static void parse_topology_amd(struct topo_scan *tscan) { - bool has_topoext = false; - /* - * If the extended topology leaf 0x8000_001e is available - * try to get SMT, CORE, TILE, and DIE shifts from extended + * Try to get SMT, CORE, TILE, and DIE shifts from extended * CPUID leaf 0x8000_0026 on supported processors first. If * extended CPUID leaf 0x8000_0026 is not supported, try to - * get SMT and CORE shift from leaf 0xb first, then try to - * get the CORE shift from leaf 0x8000_0008. + * get SMT and CORE shift from leaf 0xb. If either leaf is + * available, cpu_parse_topology_ext() will return true. */ - if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) - has_topoext = cpu_parse_topology_ext(tscan); + bool has_xtopology = cpu_parse_topology_ext(tscan); if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); - if (!has_topoext && !parse_8000_0008(tscan)) + /* + * If XTOPOLOGY leaves (0x26/0xb) are not available, try to + * get the CORE shift from leaf 0x8000_0008 first. + */ + if (!has_xtopology && !parse_8000_0008(tscan)) return; - /* Prefer leaf 0x8000001e if available */ - if (parse_8000_001e(tscan, has_topoext)) + /* + * Prefer leaf 0x8000001e if available to get the SMT shift and + * the initial APIC ID if XTOPOLOGY leaves are not available. + */ + if (parse_8000_001e(tscan, has_xtopology)) return; /* Try the NODEID MSR */ From 77b8e6fbf9848d651f5cb7508f18ad0971f3ffdb Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 8 Sep 2025 15:52:02 +0200 Subject: [PATCH 151/450] dm-integrity: limit MAX_TAG_SIZE to 255 MAX_TAG_SIZE was 0x1a8 and it may be truncated in the "bi->metadata_size = ic->tag_size" assignment. We need to limit it to 255. Signed-off-by: Mikulas Patocka --- drivers/md/dm-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index efeee0a873c0..ab96b692e5a3 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -133,7 +133,7 @@ struct journal_sector { commit_id_t commit_id; }; -#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) +#define MAX_TAG_SIZE 255 #define METADATA_PADDING_SECTORS 8 From 440cec4ca1c242d72e309a801995584a55af25c6 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 18 Jul 2025 18:50:58 +0530 Subject: [PATCH 152/450] drm/amdgpu: Wait for bootloader after PSPv11 reset Some PSPv11 SOCs take a longer time for PSP based mode-1 reset. Instead of checking for C2PMSG_33 status, add the callback wait_for_bootloader. Wait for bootloader to be back to steady state is already part of the generic mode-1 reset flow. Increase the retry count for bootloader wait and also fix the mask to prevent fake pass. Fixes: 8345a71fc54b ("drm/amdgpu: Add more checks to PSP mailbox") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4531 Signed-off-by: Lijo Lazar Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 32f73741d6ee41fd5db8791c1163931e313d0fdc) --- drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index 6cc05d36e359..64b240b51f1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -149,12 +149,12 @@ static int psp_v11_0_wait_for_bootloader(struct psp_context *psp) int ret; int retry_loop; - for (retry_loop = 0; retry_loop < 10; retry_loop++) { + for (retry_loop = 0; retry_loop < 20; retry_loop++) { /* Wait for bootloader to signify that is ready having bit 31 of C2PMSG_35 set to 1 */ ret = psp_wait_for( psp, SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_35), - 0x80000000, 0x80000000, PSP_WAITREG_NOVERBOSE); + 0x80000000, 0x8000FFFF, PSP_WAITREG_NOVERBOSE); if (ret == 0) return 0; @@ -397,18 +397,6 @@ static int psp_v11_0_mode1_reset(struct psp_context *psp) msleep(500); - offset = SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_33); - - ret = psp_wait_for(psp, offset, MBOX_TOS_RESP_FLAG, MBOX_TOS_RESP_MASK, - 0); - - if (ret) { - DRM_INFO("psp mode 1 reset failed!\n"); - return -EINVAL; - } - - DRM_INFO("psp mode1 reset succeed \n"); - return 0; } @@ -665,7 +653,8 @@ static const struct psp_funcs psp_v11_0_funcs = { .ring_get_wptr = psp_v11_0_ring_get_wptr, .ring_set_wptr = psp_v11_0_ring_set_wptr, .load_usbc_pd_fw = psp_v11_0_load_usbc_pd_fw, - .read_usbc_pd_fw = psp_v11_0_read_usbc_pd_fw + .read_usbc_pd_fw = psp_v11_0_read_usbc_pd_fw, + .wait_for_bootloader = psp_v11_0_wait_for_bootloader }; void psp_v11_0_set_psp_funcs(struct psp_context *psp) From c05d0b32eebadc8be6e53196e99c64cf2bed1d99 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Sat, 6 Sep 2025 11:09:13 +0200 Subject: [PATCH 153/450] regulator: sy7636a: fix lifecycle of power good gpio Attach the power good gpio to the regulator device devres instead of the parent device to fix problems if probe is run multiple times (rmmod/insmod or some deferral). Fixes: 8c485bedfb785 ("regulator: sy7636a: Initial commit") Signed-off-by: Andreas Kemnade Reviewed-by: Alistair Francis Reviewed-by: Peng Fan Message-ID: <20250906-sy7636-rsrc-v1-2-e2886a9763a7@kernel.org> Signed-off-by: Mark Brown --- drivers/regulator/sy7636a-regulator.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/regulator/sy7636a-regulator.c b/drivers/regulator/sy7636a-regulator.c index d1e7ba1fb3e1..27e3d939b7bb 100644 --- a/drivers/regulator/sy7636a-regulator.c +++ b/drivers/regulator/sy7636a-regulator.c @@ -83,9 +83,11 @@ static int sy7636a_regulator_probe(struct platform_device *pdev) if (!regmap) return -EPROBE_DEFER; - gdp = devm_gpiod_get(pdev->dev.parent, "epd-pwr-good", GPIOD_IN); + device_set_of_node_from_dev(&pdev->dev, pdev->dev.parent); + + gdp = devm_gpiod_get(&pdev->dev, "epd-pwr-good", GPIOD_IN); if (IS_ERR(gdp)) { - dev_err(pdev->dev.parent, "Power good GPIO fault %ld\n", PTR_ERR(gdp)); + dev_err(&pdev->dev, "Power good GPIO fault %ld\n", PTR_ERR(gdp)); return PTR_ERR(gdp); } @@ -105,7 +107,6 @@ static int sy7636a_regulator_probe(struct platform_device *pdev) } config.dev = &pdev->dev; - config.dev->of_node = pdev->dev.parent->of_node; config.regmap = regmap; rdev = devm_regulator_register(&pdev->dev, &desc, &config); From 4b66d18918f8e4d85e51974a9e3ce9abad5c7c3d Mon Sep 17 00:00:00 2001 From: Miaoqing Pan Date: Mon, 8 Sep 2025 09:50:25 +0800 Subject: [PATCH 154/450] wifi: ath12k: Fix missing station power save configuration Commit afbab6e4e88d ("wifi: ath12k: modify ath12k_mac_op_bss_info_changed() for MLO") replaced the bss_info_changed() callback with vif_cfg_changed() and link_info_changed() to support Multi-Link Operation (MLO). As a result, the station power save configuration is no longer correctly applied in ath12k_mac_bss_info_changed(). Move the handling of 'BSS_CHANGED_PS' into ath12k_mac_op_vif_cfg_changed() to align with the updated callback structure introduced for MLO, ensuring proper power-save behavior for station interfaces. Tested-on: WCN7850 hw2.0 PCI WLAN.IOE_HMT.1.1-00011-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1 Fixes: afbab6e4e88d ("wifi: ath12k: modify ath12k_mac_op_bss_info_changed() for MLO") Signed-off-by: Miaoqing Pan Reviewed-by: Baochen Qiang Link: https://patch.msgid.link/20250908015025.1301398-1-miaoqing.pan@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 122 ++++++++++++++------------ 1 file changed, 67 insertions(+), 55 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index bd1ec3b2c084..3a3965b79942 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4078,12 +4078,68 @@ static int ath12k_mac_fils_discovery(struct ath12k_link_vif *arvif, return ret; } +static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif) +{ + struct ath12k *ar = arvif->ar; + struct ieee80211_vif *vif = arvif->ahvif->vif; + struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf; + enum wmi_sta_powersave_param param; + struct ieee80211_bss_conf *info; + enum wmi_sta_ps_mode psmode; + int ret; + int timeout; + bool enable_ps; + + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + if (vif->type != NL80211_IFTYPE_STATION) + return; + + enable_ps = arvif->ahvif->ps; + if (enable_ps) { + psmode = WMI_STA_PS_MODE_ENABLED; + param = WMI_STA_PS_PARAM_INACTIVITY_TIME; + + timeout = conf->dynamic_ps_timeout; + if (timeout == 0) { + info = ath12k_mac_get_link_bss_conf(arvif); + if (!info) { + ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n", + vif->addr, arvif->link_id); + return; + } + + /* firmware doesn't like 0 */ + timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000; + } + + ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param, + timeout); + if (ret) { + ath12k_warn(ar->ab, "failed to set inactivity time for vdev %d: %i\n", + arvif->vdev_id, ret); + return; + } + } else { + psmode = WMI_STA_PS_MODE_DISABLED; + } + + ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %d psmode %s\n", + arvif->vdev_id, psmode ? "enable" : "disable"); + + ret = ath12k_wmi_pdev_set_ps_mode(ar, arvif->vdev_id, psmode); + if (ret) + ath12k_warn(ar->ab, "failed to set sta power save mode %d for vdev %d: %d\n", + psmode, arvif->vdev_id, ret); +} + static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u64 changed) { struct ath12k_vif *ahvif = ath12k_vif_to_ahvif(vif); unsigned long links = ahvif->links_map; + struct ieee80211_vif_cfg *vif_cfg; struct ieee80211_bss_conf *info; struct ath12k_link_vif *arvif; struct ieee80211_sta *sta; @@ -4147,61 +4203,24 @@ static void ath12k_mac_op_vif_cfg_changed(struct ieee80211_hw *hw, } } } -} -static void ath12k_mac_vif_setup_ps(struct ath12k_link_vif *arvif) -{ - struct ath12k *ar = arvif->ar; - struct ieee80211_vif *vif = arvif->ahvif->vif; - struct ieee80211_conf *conf = &ath12k_ar_to_hw(ar)->conf; - enum wmi_sta_powersave_param param; - struct ieee80211_bss_conf *info; - enum wmi_sta_ps_mode psmode; - int ret; - int timeout; - bool enable_ps; + if (changed & BSS_CHANGED_PS) { + links = ahvif->links_map; + vif_cfg = &vif->cfg; - lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + for_each_set_bit(link_id, &links, IEEE80211_MLD_MAX_NUM_LINKS) { + arvif = wiphy_dereference(hw->wiphy, ahvif->link[link_id]); + if (!arvif || !arvif->ar) + continue; - if (vif->type != NL80211_IFTYPE_STATION) - return; + ar = arvif->ar; - enable_ps = arvif->ahvif->ps; - if (enable_ps) { - psmode = WMI_STA_PS_MODE_ENABLED; - param = WMI_STA_PS_PARAM_INACTIVITY_TIME; - - timeout = conf->dynamic_ps_timeout; - if (timeout == 0) { - info = ath12k_mac_get_link_bss_conf(arvif); - if (!info) { - ath12k_warn(ar->ab, "unable to access bss link conf in setup ps for vif %pM link %u\n", - vif->addr, arvif->link_id); - return; + if (ar->ab->hw_params->supports_sta_ps) { + ahvif->ps = vif_cfg->ps; + ath12k_mac_vif_setup_ps(arvif); } - - /* firmware doesn't like 0 */ - timeout = ieee80211_tu_to_usec(info->beacon_int) / 1000; } - - ret = ath12k_wmi_set_sta_ps_param(ar, arvif->vdev_id, param, - timeout); - if (ret) { - ath12k_warn(ar->ab, "failed to set inactivity time for vdev %d: %i\n", - arvif->vdev_id, ret); - return; - } - } else { - psmode = WMI_STA_PS_MODE_DISABLED; } - - ath12k_dbg(ar->ab, ATH12K_DBG_MAC, "mac vdev %d psmode %s\n", - arvif->vdev_id, psmode ? "enable" : "disable"); - - ret = ath12k_wmi_pdev_set_ps_mode(ar, arvif->vdev_id, psmode); - if (ret) - ath12k_warn(ar->ab, "failed to set sta power save mode %d for vdev %d: %d\n", - psmode, arvif->vdev_id, ret); } static bool ath12k_mac_supports_tpc(struct ath12k *ar, struct ath12k_vif *ahvif, @@ -4223,7 +4242,6 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, { struct ath12k_vif *ahvif = arvif->ahvif; struct ieee80211_vif *vif = ath12k_ahvif_to_vif(ahvif); - struct ieee80211_vif_cfg *vif_cfg = &vif->cfg; struct cfg80211_chan_def def; u32 param_id, param_value; enum nl80211_band band; @@ -4510,12 +4528,6 @@ static void ath12k_mac_bss_info_changed(struct ath12k *ar, } ath12k_mac_fils_discovery(arvif, info); - - if (changed & BSS_CHANGED_PS && - ar->ab->hw_params->supports_sta_ps) { - ahvif->ps = vif_cfg->ps; - ath12k_mac_vif_setup_ps(arvif); - } } static struct ath12k_vif_cache *ath12k_ahvif_get_link_cache(struct ath12k_vif *ahvif, From 82e2be57d544ff9ad4696c85600827b39be8ce9e Mon Sep 17 00:00:00 2001 From: Miaoqing Pan Date: Mon, 8 Sep 2025 09:51:39 +0800 Subject: [PATCH 155/450] wifi: ath12k: fix WMI TLV header misalignment When buf_len is not 4-byte aligned in ath12k_wmi_mgmt_send(), the firmware asserts and triggers a recovery. The following error messages are observed: ath12k_pci 0004:01:00.0: failed to submit WMI_MGMT_TX_SEND_CMDID cmd ath12k_pci 0004:01:00.0: failed to send mgmt frame: -108 ath12k_pci 0004:01:00.0: failed to tx mgmt frame, vdev_id 0 :-108 ath12k_pci 0004:01:00.0: waiting recovery start... This issue was observed when running 'iw wlanx set power_save off/on' in MLO station mode, which triggers the sending of an SMPS action frame with a length of 27 bytes to the AP. To resolve the misalignment, use buf_len_aligned instead of buf_len when constructing the WMI TLV header. Tested-on: WCN7850 hw2.0 PCI WLAN.IOE_HMT.1.1-00011-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1 Fixes: d889913205cf ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices") Signed-off-by: Miaoqing Pan Reviewed-by: Baochen Qiang Link: https://patch.msgid.link/20250908015139.1301437-1-miaoqing.pan@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 742ffeb48bce..29dadedefdd2 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -843,7 +843,7 @@ int ath12k_wmi_mgmt_send(struct ath12k_link_vif *arvif, u32 buf_id, cmd->tx_params_valid = 0; frame_tlv = (struct wmi_tlv *)(skb->data + sizeof(*cmd)); - frame_tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_BYTE, buf_len); + frame_tlv->header = ath12k_wmi_tlv_hdr(WMI_TAG_ARRAY_BYTE, buf_len_aligned); memcpy(frame_tlv->value, frame->data, buf_len); From dd2fa82473453661d12723c46c9f43d9876a7efd Mon Sep 17 00:00:00 2001 From: Jonathan Curley Date: Mon, 8 Sep 2025 17:35:16 +0000 Subject: [PATCH 156/450] NFSv4/flexfiles: Fix layout merge mirror check. Typo in ff_lseg_match_mirrors makes the diff ineffective. This results in merge happening all the time. Merge happening all the time is problematic because it marks lsegs invalid. Marking lsegs invalid causes all outstanding IO to get restarted with EAGAIN and connections to get closed. Closing connections constantly triggers race conditions in the RDMA implementation... Fixes: 660d1eb22301c ("pNFS/flexfile: Don't merge layout segments if the mirrors don't match") Signed-off-by: Jonathan Curley Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index f8ab7b4e09e7..9edb5f9b0c4e 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -293,7 +293,7 @@ ff_lseg_match_mirrors(struct pnfs_layout_segment *l1, struct pnfs_layout_segment *l2) { const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1); - const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1); + const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2); u32 i; if (fl1->mirror_array_cnt != fl2->mirror_array_cnt) From cd4453c5e983cf1fd5757e9acb915adb1e4602b6 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Mon, 8 Sep 2025 02:46:58 +0000 Subject: [PATCH 157/450] tracing: Silence warning when chunk allocation fails in trace_pid_write Syzkaller trigger a fault injection warning: WARNING: CPU: 1 PID: 12326 at tracepoint_add_func+0xbfc/0xeb0 Modules linked in: CPU: 1 UID: 0 PID: 12326 Comm: syz.6.10325 Tainted: G U 6.14.0-rc5-syzkaller #0 Tainted: [U]=USER Hardware name: Google Compute Engine/Google Compute Engine RIP: 0010:tracepoint_add_func+0xbfc/0xeb0 kernel/tracepoint.c:294 Code: 09 fe ff 90 0f 0b 90 0f b6 74 24 43 31 ff 41 bc ea ff ff ff RSP: 0018:ffffc9000414fb48 EFLAGS: 00010283 RAX: 00000000000012a1 RBX: ffffffff8e240ae0 RCX: ffffc90014b78000 RDX: 0000000000080000 RSI: ffffffff81bbd78b RDI: 0000000000000001 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: ffffffffffffffef R13: 0000000000000000 R14: dffffc0000000000 R15: ffffffff81c264f0 FS: 00007f27217f66c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2e80dff8 CR3: 00000000268f8000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tracepoint_probe_register_prio+0xc0/0x110 kernel/tracepoint.c:464 register_trace_prio_sched_switch include/trace/events/sched.h:222 [inline] register_pid_events kernel/trace/trace_events.c:2354 [inline] event_pid_write.isra.0+0x439/0x7a0 kernel/trace/trace_events.c:2425 vfs_write+0x24c/0x1150 fs/read_write.c:677 ksys_write+0x12b/0x250 fs/read_write.c:731 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f We can reproduce the warning by following the steps below: 1. echo 8 >> set_event_notrace_pid. Let tr->filtered_pids owns one pid and register sched_switch tracepoint. 2. echo ' ' >> set_event_pid, and perform fault injection during chunk allocation of trace_pid_list_alloc. Let pid_list with no pid and assign to tr->filtered_pids. 3. echo ' ' >> set_event_pid. Let pid_list is NULL and assign to tr->filtered_pids. 4. echo 9 >> set_event_pid, will trigger the double register sched_switch tracepoint warning. The reason is that syzkaller injects a fault into the chunk allocation in trace_pid_list_alloc, causing a failure in trace_pid_list_set, which may trigger double register of the same tracepoint. This only occurs when the system is about to crash, but to suppress this warning, let's add failure handling logic to trace_pid_list_set. Link: https://lore.kernel.org/20250908024658.2390398-1-pulehui@huaweicloud.com Fixes: 8d6e90983ade ("tracing: Create a sparse bitmask for pid filtering") Reported-by: syzbot+161412ccaeff20ce4dde@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67cb890e.050a0220.d8275.022e.GAE@google.com Signed-off-by: Pu Lehui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2f1ae6c0ee81..b3c94fbaf002 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -834,7 +834,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { - trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } @@ -871,6 +874,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_clear(&parser); ret = 0; } + out: trace_parser_put(&parser); if (ret < 0) { From b816265396daf1beb915e0ffbfd7f3906c2bf4a4 Mon Sep 17 00:00:00 2001 From: Klaus Kudielka Date: Sun, 7 Sep 2025 12:21:46 +0200 Subject: [PATCH 158/450] PCI: mvebu: Fix use of for_each_of_range() iterator 5da3d94a23c6 ("PCI: mvebu: Use for_each_of_range() iterator for parsing "ranges"") simplified code by using the for_each_of_range() iterator, but it broke PCI enumeration on Turris Omnia (and probably other mvebu targets). Issue #1: To determine range.flags, of_pci_range_parser_one() uses bus->get_flags(), which resolves to of_bus_pci_get_flags(), which already returns an IORESOURCE bit field, and NOT the original flags from the "ranges" resource. Then mvebu_get_tgt_attr() attempts the very same conversion again. Remove the misinterpretation of range.flags in mvebu_get_tgt_attr(), to restore the intended behavior. Issue #2: The driver needs target and attributes, which are encoded in the raw address values of the "/soc/pcie/ranges" resource. According to of_pci_range_parser_one(), the raw values are stored in range.bus_addr and range.parent_bus_addr, respectively. range.cpu_addr is a translated version of range.parent_bus_addr, and not relevant here. Use the correct range structure member, to extract target and attributes. This restores the intended behavior. Fixes: 5da3d94a23c6 ("PCI: mvebu: Use for_each_of_range() iterator for parsing "ranges"") Reported-by: Jan Palus Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220479 Signed-off-by: Klaus Kudielka Signed-off-by: Bjorn Helgaas Tested-by: Tony Dinh Tested-by: Jan Palus Link: https://patch.msgid.link/20250907102303.29735-1-klaus.kudielka@gmail.com --- drivers/pci/controller/pci-mvebu.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/pci/controller/pci-mvebu.c b/drivers/pci/controller/pci-mvebu.c index 755651f33811..a72aa57591c0 100644 --- a/drivers/pci/controller/pci-mvebu.c +++ b/drivers/pci/controller/pci-mvebu.c @@ -1168,12 +1168,6 @@ static void __iomem *mvebu_pcie_map_registers(struct platform_device *pdev, return devm_ioremap_resource(&pdev->dev, &port->regs); } -#define DT_FLAGS_TO_TYPE(flags) (((flags) >> 24) & 0x03) -#define DT_TYPE_IO 0x1 -#define DT_TYPE_MEM32 0x2 -#define DT_CPUADDR_TO_TARGET(cpuaddr) (((cpuaddr) >> 56) & 0xFF) -#define DT_CPUADDR_TO_ATTR(cpuaddr) (((cpuaddr) >> 48) & 0xFF) - static int mvebu_get_tgt_attr(struct device_node *np, int devfn, unsigned long type, unsigned int *tgt, @@ -1189,19 +1183,12 @@ static int mvebu_get_tgt_attr(struct device_node *np, int devfn, return -EINVAL; for_each_of_range(&parser, &range) { - unsigned long rtype; u32 slot = upper_32_bits(range.bus_addr); - if (DT_FLAGS_TO_TYPE(range.flags) == DT_TYPE_IO) - rtype = IORESOURCE_IO; - else if (DT_FLAGS_TO_TYPE(range.flags) == DT_TYPE_MEM32) - rtype = IORESOURCE_MEM; - else - continue; - - if (slot == PCI_SLOT(devfn) && type == rtype) { - *tgt = DT_CPUADDR_TO_TARGET(range.cpu_addr); - *attr = DT_CPUADDR_TO_ATTR(range.cpu_addr); + if (slot == PCI_SLOT(devfn) && + type == (range.flags & IORESOURCE_TYPE_BITS)) { + *tgt = (range.parent_bus_addr >> 56) & 0xFF; + *attr = (range.parent_bus_addr >> 48) & 0xFF; return 0; } } From 1dbfb0363224f6da56f6655d596dc5097308d6f5 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Fri, 5 Sep 2025 06:57:27 -0700 Subject: [PATCH 159/450] genetlink: fix genl_bind() invoking bind() after -EPERM Per family bind/unbind callbacks were introduced to allow families to track multicast group consumer presence, e.g. to start or stop producing events depending on listeners. However, in genl_bind() the bind() callback was invoked even if capability checks failed and ret was set to -EPERM. This means that callbacks could run on behalf of unauthorized callers while the syscall still returned failure to user space. Fix this by only invoking bind() after "if (ret) break;" check i.e. after permission checks have succeeded. Fixes: 3de21a8990d3 ("genetlink: Add per family bind/unbind callbacks") Signed-off-by: Alok Tiwari Link: https://patch.msgid.link/20250905135731.3026965-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- net/netlink/genetlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 104732d34543..978c129c6095 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group) !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; + if (ret) + break; + if (family->bind) family->bind(i); From 674b34c4c770551e916ae707829c7faea4782d3a Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 5 Sep 2025 14:45:07 +0200 Subject: [PATCH 160/450] net: dsa: b53: fix ageing time for BCM53101 For some reason Broadcom decided that BCM53101 uses 0.5s increments for the ageing time register, but kept the field width the same [1]. Due to this, the actual ageing time was always half of what was configured. Fix this by adapting the limits and value calculation for BCM53101. So far it looks like this is the only chip with the increased tick speed: $ grep -l -r "Specifies the aging time in 0.5 seconds" cdk/PKG/chip | sort cdk/PKG/chip/bcm53101/bcm53101_a0_defs.h $ grep -l -r "Specifies the aging time in seconds" cdk/PKG/chip | sort cdk/PKG/chip/bcm53010/bcm53010_a0_defs.h cdk/PKG/chip/bcm53020/bcm53020_a0_defs.h cdk/PKG/chip/bcm53084/bcm53084_a0_defs.h cdk/PKG/chip/bcm53115/bcm53115_a0_defs.h cdk/PKG/chip/bcm53118/bcm53118_a0_defs.h cdk/PKG/chip/bcm53125/bcm53125_a0_defs.h cdk/PKG/chip/bcm53128/bcm53128_a0_defs.h cdk/PKG/chip/bcm53134/bcm53134_a0_defs.h cdk/PKG/chip/bcm53242/bcm53242_a0_defs.h cdk/PKG/chip/bcm53262/bcm53262_a0_defs.h cdk/PKG/chip/bcm53280/bcm53280_a0_defs.h cdk/PKG/chip/bcm53280/bcm53280_b0_defs.h cdk/PKG/chip/bcm53600/bcm53600_a0_defs.h cdk/PKG/chip/bcm89500/bcm89500_a0_defs.h [1] https://github.com/Broadcom/OpenMDK/blob/a5d3fc9b12af3eeb68f2ca0ce7ec4056cd14d6c2/cdk/PKG/chip/bcm53101/bcm53101_a0_defs.h#L28966 Fixes: e39d14a760c0 ("net: dsa: b53: implement setting ageing time") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250905124507.59186-1-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 829b1f087e9e..2f846381d5a7 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1273,9 +1273,15 @@ static int b53_setup(struct dsa_switch *ds) */ ds->untag_vlan_aware_bridge_pvid = true; - /* Ageing time is set in seconds */ - ds->ageing_time_min = 1 * 1000; - ds->ageing_time_max = AGE_TIME_MAX * 1000; + if (dev->chip_id == BCM53101_DEVICE_ID) { + /* BCM53101 uses 0.5 second increments */ + ds->ageing_time_min = 1 * 500; + ds->ageing_time_max = AGE_TIME_MAX * 500; + } else { + /* Everything else uses 1 second increments */ + ds->ageing_time_min = 1 * 1000; + ds->ageing_time_max = AGE_TIME_MAX * 1000; + } ret = b53_reset_switch(dev); if (ret) { @@ -2559,7 +2565,10 @@ int b53_set_ageing_time(struct dsa_switch *ds, unsigned int msecs) else reg = B53_AGING_TIME_CONTROL; - atc = DIV_ROUND_CLOSEST(msecs, 1000); + if (dev->chip_id == BCM53101_DEVICE_ID) + atc = DIV_ROUND_CLOSEST(msecs, 500); + else + atc = DIV_ROUND_CLOSEST(msecs, 1000); if (!is5325(dev) && !is5365(dev)) atc |= AGE_CHANGE; From 8625f5748fea960d2af4f3c3e9891ee8f6f80906 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 5 Sep 2025 13:12:33 +0200 Subject: [PATCH 161/450] net: bridge: Bounce invalid boolopts The bridge driver currently tolerates options that it does not recognize. Instead, it should bounce them. Fixes: a428afe82f98 ("net: bridge: add support for user-controlled bool options") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/e6fdca3b5a8d54183fbda075daffef38bdd7ddce.1757070067.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/bridge/br.c b/net/bridge/br.c index 1885d0c315f0..c683baa3847f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -324,6 +324,13 @@ int br_boolopt_multi_toggle(struct net_bridge *br, int err = 0; int opt_id; + opt_id = find_next_bit(&bitmap, BITS_PER_LONG, BR_BOOLOPT_MAX); + if (opt_id != BITS_PER_LONG) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d", + opt_id); + return -EINVAL; + } + for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); From d3b28612bc5500133260aaf36794a0a0c287d61b Mon Sep 17 00:00:00 2001 From: Jonas Rebmann Date: Fri, 5 Sep 2025 14:20:50 +0200 Subject: [PATCH 162/450] net: phy: NXP_TJA11XX: Update Kconfig with TJA1102 support Update the Kconfig description to indicate support for the TJA1102. Signed-off-by: Jonas Rebmann Link: https://patch.msgid.link/20250905-tja1102-kconfig-v1-1-a57e6ac4e264@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 28acc6392cfc..392749aae54d 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -361,7 +361,7 @@ config NXP_TJA11XX_PHY tristate "NXP TJA11xx PHYs support" depends on HWMON help - Currently supports the NXP TJA1100 and TJA1101 PHY. + Currently supports the NXP TJA1100, TJA1101 and TJA1102 PHYs. config NCN26000_PHY tristate "Onsemi 10BASE-T1S Ethernet PHY" From 7989fdce69ec0a928e136477da2aa208a191fba2 Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Fri, 22 Aug 2025 15:55:16 -0700 Subject: [PATCH 163/450] percpu: fix race on alloc failed warning limit The 'allocation failed, ...' warning messages can cause unlimited log spam, contrary to the implementation's intent. The warn_limit variable is accessed without synchronization. If more than threads enter the warning path at the same time, the variable will get decremented past 0. Once it becomes negative, the non-zero check will always return true leading to unlimited log spam. Use atomic operation to access warn_limit and change condition to test for non-negative (>= 0) - atomic_dec_if_positive will return -1 once warn_limit becomes 0. Continue to print disable message alongside the last warning. While the change cited in Fixes is only adjacent, the warning limit implementation was correct before it. Only non-atomic allocations were considered for warnings, and those happened to hold pcpu_alloc_mutex while accessing warn_limit. [vdumitrescu@nvidia.com: prevent warn_limit from going negative, per Christoph Lameter] Link: https://lkml.kernel.org/r/ee87cc59-2717-4dbb-8052-1d2692c5aaaa@nvidia.com Link: https://lkml.kernel.org/r/ab22061a-a62f-4429-945b-744e5cc4ba35@nvidia.com Fixes: f7d77dfc91f7 ("mm/percpu.c: print error message too if atomic alloc failed") Signed-off-by: Vlad Dumitrescu Reviewed-by: Baoquan He Cc: Christoph Lameter (Ampere) Cc: Dennis Zhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/percpu.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index a56f35dcc417..81462ce5866e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1734,7 +1734,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; - static int warn_limit = 10; + static atomic_t warn_limit = ATOMIC_INIT(10); struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; @@ -1904,13 +1904,17 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); - if (do_warn && warn_limit) { - pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", - size, align, is_atomic, err); - if (!is_atomic) - dump_stack(); - if (!--warn_limit) - pr_info("limit reached, disable warning\n"); + if (do_warn) { + int remaining = atomic_dec_if_positive(&warn_limit); + + if (remaining >= 0) { + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); + if (!is_atomic) + dump_stack(); + if (remaining == 0) + pr_info("limit reached, disable warning\n"); + } } if (is_atomic) { From 78d2d32f0b789d67cbe5cfea0c0714cb2446c37e Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Thu, 28 Aug 2025 14:26:56 +0000 Subject: [PATCH 164/450] mm/mremap: fix regression in vrm->new_addr check Commit 3215eaceca87 ("mm/mremap: refactor initial parameter sanity checks") moved the sanity check for vrm->new_addr from mremap_to() to check_mremap_params(). However, this caused a regression as vrm->new_addr is now checked even when MREMAP_FIXED and MREMAP_DONTUNMAP flags are not specified. In this case, vrm->new_addr can be garbage and create unexpected failures. Fix this by moving the new_addr check after the vrm_implies_new_addr() guard. This ensures that the new_addr is only checked when the user has specified one explicitly. Link: https://lkml.kernel.org/r/20250828142657.770502-1-cmllamas@google.com Fixes: 3215eaceca87 ("mm/mremap: refactor initial parameter sanity checks") Signed-off-by: Carlos Llamas Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Carlos Llamas Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/mremap.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index e618a706aff5..35de0a7b910e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1774,15 +1774,18 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) if (!vrm->new_len) return -EINVAL; - /* Is the new length or address silly? */ - if (vrm->new_len > TASK_SIZE || - vrm->new_addr > TASK_SIZE - vrm->new_len) + /* Is the new length silly? */ + if (vrm->new_len > TASK_SIZE) return -EINVAL; /* Remainder of checks are for cases with specific new_addr. */ if (!vrm_implies_new_addr(vrm)) return 0; + /* Is the new address silly? */ + if (vrm->new_addr > TASK_SIZE - vrm->new_len) + return -EINVAL; + /* The new address must be page-aligned. */ if (offset_in_page(vrm->new_addr)) return -EINVAL; From d613f53c83ec47089c4e25859d5e8e0359f6f8da Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Aug 2025 10:46:18 +0800 Subject: [PATCH 165/450] mm/memory-failure: fix VM_BUG_ON_PAGE(PagePoisoned(page)) when unpoison memory When I did memory failure tests, below panic occurs: page dumped because: VM_BUG_ON_PAGE(PagePoisoned(page)) kernel BUG at include/linux/page-flags.h:616! Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 3 PID: 720 Comm: bash Not tainted 6.10.0-rc1-00195-g148743902568 #40 RIP: 0010:unpoison_memory+0x2f3/0x590 RSP: 0018:ffffa57fc8787d60 EFLAGS: 00000246 RAX: 0000000000000037 RBX: 0000000000000009 RCX: ffff9be25fcdc9c8 RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff9be25fcdc9c0 RBP: 0000000000300000 R08: ffffffffb4956f88 R09: 0000000000009ffb R10: 0000000000000284 R11: ffffffffb4926fa0 R12: ffffe6b00c000000 R13: ffff9bdb453dfd00 R14: 0000000000000000 R15: fffffffffffffffe FS: 00007f08f04e4740(0000) GS:ffff9be25fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000564787a30410 CR3: 000000010d4e2000 CR4: 00000000000006f0 Call Trace: unpoison_memory+0x2f3/0x590 simple_attr_write_xsigned.constprop.0.isra.0+0xb3/0x110 debugfs_attr_write+0x42/0x60 full_proxy_write+0x5b/0x80 vfs_write+0xd5/0x540 ksys_write+0x64/0xe0 do_syscall_64+0xb9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f08f0314887 RSP: 002b:00007ffece710078 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007f08f0314887 RDX: 0000000000000009 RSI: 0000564787a30410 RDI: 0000000000000001 RBP: 0000564787a30410 R08: 000000000000fefe R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000009 R13: 00007f08f041b780 R14: 00007f08f0417600 R15: 00007f08f0416a00 Modules linked in: hwpoison_inject ---[ end trace 0000000000000000 ]--- RIP: 0010:unpoison_memory+0x2f3/0x590 RSP: 0018:ffffa57fc8787d60 EFLAGS: 00000246 RAX: 0000000000000037 RBX: 0000000000000009 RCX: ffff9be25fcdc9c8 RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff9be25fcdc9c0 RBP: 0000000000300000 R08: ffffffffb4956f88 R09: 0000000000009ffb R10: 0000000000000284 R11: ffffffffb4926fa0 R12: ffffe6b00c000000 R13: ffff9bdb453dfd00 R14: 0000000000000000 R15: fffffffffffffffe FS: 00007f08f04e4740(0000) GS:ffff9be25fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000564787a30410 CR3: 000000010d4e2000 CR4: 00000000000006f0 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x31c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]--- The root cause is that unpoison_memory() tries to check the PG_HWPoison flags of an uninitialized page. So VM_BUG_ON_PAGE(PagePoisoned(page)) is triggered. This can be reproduced by below steps: 1.Offline memory block: echo offline > /sys/devices/system/memory/memory12/state 2.Get offlined memory pfn: page-types -b n -rlN 3.Write pfn to unpoison-pfn echo > /sys/kernel/debug/hwpoison/unpoison-pfn This scenario can be identified by pfn_to_online_page() returning NULL. And ZONE_DEVICE pages are never expected, so we can simply fail if pfn_to_online_page() == NULL to fix the bug. Link: https://lkml.kernel.org/r/20250828024618.1744895-1-linmiaohe@huawei.com Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") Signed-off-by: Miaohe Lin Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 10b3c281c2ae..df6ee59527dd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2568,10 +2568,9 @@ int unpoison_memory(unsigned long pfn) static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if (!pfn_valid(pfn)) - return -ENXIO; - - p = pfn_to_page(pfn); + p = pfn_to_online_page(pfn); + if (!p) + return -EIO; folio = page_folio(p); mutex_lock(&mf_mutex); From 04100f775c2ea501927f508f17ad824ad1f23c8d Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Fri, 29 Aug 2025 10:18:15 -0500 Subject: [PATCH 166/450] ocfs2: fix recursive semaphore deadlock in fiemap call syzbot detected a OCFS2 hang due to a recursive semaphore on a FS_IOC_FIEMAP of the extent list on a specially crafted mmap file. context_switch kernel/sched/core.c:5357 [inline] __schedule+0x1798/0x4cc0 kernel/sched/core.c:6961 __schedule_loop kernel/sched/core.c:7043 [inline] schedule+0x165/0x360 kernel/sched/core.c:7058 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7115 rwsem_down_write_slowpath+0x872/0xfe0 kernel/locking/rwsem.c:1185 __down_write_common kernel/locking/rwsem.c:1317 [inline] __down_write kernel/locking/rwsem.c:1326 [inline] down_write+0x1ab/0x1f0 kernel/locking/rwsem.c:1591 ocfs2_page_mkwrite+0x2ff/0xc40 fs/ocfs2/mmap.c:142 do_page_mkwrite+0x14d/0x310 mm/memory.c:3361 wp_page_shared mm/memory.c:3762 [inline] do_wp_page+0x268d/0x5800 mm/memory.c:3981 handle_pte_fault mm/memory.c:6068 [inline] __handle_mm_fault+0x1033/0x5440 mm/memory.c:6195 handle_mm_fault+0x40a/0x8e0 mm/memory.c:6364 do_user_addr_fault+0x764/0x1390 arch/x86/mm/fault.c:1387 handle_page_fault arch/x86/mm/fault.c:1476 [inline] exc_page_fault+0x76/0xf0 arch/x86/mm/fault.c:1532 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623 RIP: 0010:copy_user_generic arch/x86/include/asm/uaccess_64.h:126 [inline] RIP: 0010:raw_copy_to_user arch/x86/include/asm/uaccess_64.h:147 [inline] RIP: 0010:_inline_copy_to_user include/linux/uaccess.h:197 [inline] RIP: 0010:_copy_to_user+0x85/0xb0 lib/usercopy.c:26 Code: e8 00 bc f7 fc 4d 39 fc 72 3d 4d 39 ec 77 38 e8 91 b9 f7 fc 4c 89 f7 89 de e8 47 25 5b fd 0f 01 cb 4c 89 ff 48 89 d9 4c 89 f6 a4 0f 1f 00 48 89 cb 0f 01 ca 48 89 d8 5b 41 5c 41 5d 41 5e 41 RSP: 0018:ffffc9000403f950 EFLAGS: 00050256 RAX: ffffffff84c7f101 RBX: 0000000000000038 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffffc9000403f9e0 RDI: 0000200000000060 RBP: ffffc9000403fa90 R08: ffffc9000403fa17 R09: 1ffff92000807f42 R10: dffffc0000000000 R11: fffff52000807f43 R12: 0000200000000098 R13: 00007ffffffff000 R14: ffffc9000403f9e0 R15: 0000200000000060 copy_to_user include/linux/uaccess.h:225 [inline] fiemap_fill_next_extent+0x1c0/0x390 fs/ioctl.c:145 ocfs2_fiemap+0x888/0xc90 fs/ocfs2/extent_map.c:806 ioctl_fiemap fs/ioctl.c:220 [inline] do_vfs_ioctl+0x1173/0x1430 fs/ioctl.c:532 __do_sys_ioctl fs/ioctl.c:596 [inline] __se_sys_ioctl+0x82/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f5f13850fd9 RSP: 002b:00007ffe3b3518b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000200000000000 RCX: 00007f5f13850fd9 RDX: 0000200000000040 RSI: 00000000c020660b RDI: 0000000000000004 RBP: 6165627472616568 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe3b3518f0 R13: 00007ffe3b351b18 R14: 431bde82d7b634db R15: 00007f5f1389a03b ocfs2_fiemap() takes a read lock of the ip_alloc_sem semaphore (since v2.6.22-527-g7307de80510a) and calls fiemap_fill_next_extent() to read the extent list of this running mmap executable. The user supplied buffer to hold the fiemap information page faults calling ocfs2_page_mkwrite() which will take a write lock (since v2.6.27-38-g00dc417fa3e7) of the same semaphore. This recursive semaphore will hold filesystem locks and causes a hang of the fileystem. The ip_alloc_sem protects the inode extent list and size. Release the read semphore before calling fiemap_fill_next_extent() in ocfs2_fiemap() and ocfs2_fiemap_inline(). This does an unnecessary semaphore lock/unlock on the last extent but simplifies the error path. Link: https://lkml.kernel.org/r/61d1a62b-2631-4f12-81e2-cd689914360b@oracle.com Fixes: 00dc417fa3e7 ("ocfs2: fiemap support") Signed-off-by: Mark Tinguely Reported-by: syzbot+541dcc6ee768f77103e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=541dcc6ee768f77103e7 Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/extent_map.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 930150ed5db1..ef147e8b3271 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -706,6 +706,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, * it not only handles the fiemap for inlined files, but also deals * with the fast symlink, cause they have no difference for extent * mapping per se. + * + * Must be called with ip_alloc_sem semaphore held. */ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, @@ -717,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, u64 phys; u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; struct ocfs2_inode_info *oi = OCFS2_I(inode); + lockdep_assert_held_read(&oi->ip_alloc_sem); di = (struct ocfs2_dinode *)di_bh->b_data; if (ocfs2_inode_is_fast_symlink(inode)) @@ -732,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) return ret; } @@ -802,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; - + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, len_bytes, fe_flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret) break; From 79357cd06d41d0f5a11b17d7c86176e395d10ef2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Sun, 31 Aug 2025 14:10:58 +0200 Subject: [PATCH 167/450] mm/vmalloc, mm/kasan: respect gfp mask in kasan_populate_vmalloc() kasan_populate_vmalloc() and its helpers ignore the caller's gfp_mask and always allocate memory using the hardcoded GFP_KERNEL flag. This makes them inconsistent with vmalloc(), which was recently extended to support GFP_NOFS and GFP_NOIO allocations. Page table allocations performed during shadow population also ignore the external gfp_mask. To preserve the intended semantics of GFP_NOFS and GFP_NOIO, wrap the apply_to_page_range() calls into the appropriate memalloc scope. xfs calls vmalloc with GFP_NOFS, so this bug could lead to deadlock. There was a report here https://lkml.kernel.org/r/686ea951.050a0220.385921.0016.GAE@google.com This patch: - Extends kasan_populate_vmalloc() and helpers to take gfp_mask; - Passes gfp_mask down to alloc_pages_bulk() and __get_free_page(); - Enforces GFP_NOFS/NOIO semantics with memalloc_*_save()/restore() around apply_to_page_range(); - Updates vmalloc.c and percpu allocator call sites accordingly. Link: https://lkml.kernel.org/r/20250831121058.92971-1-urezki@gmail.com Fixes: 451769ebb7e7 ("mm/vmalloc: alloc GFP_NO{FS,IO} for vmalloc") Signed-off-by: Uladzislau Rezki (Sony) Reported-by: syzbot+3470c9ffee63e4abafeb@syzkaller.appspotmail.com Reviewed-by: Andrey Ryabinin Cc: Baoquan He Cc: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- include/linux/kasan.h | 6 +++--- mm/kasan/shadow.c | 31 ++++++++++++++++++++++++------- mm/vmalloc.c | 8 ++++---- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 890011071f2b..fe5ce9215821 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -562,7 +562,7 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size); +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, @@ -574,7 +574,7 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, - unsigned long size) + unsigned long size, gfp_t gfp_mask) { return 0; } @@ -610,7 +610,7 @@ static __always_inline void kasan_poison_vmalloc(const void *start, static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, - unsigned long size) + unsigned long size, gfp_t gfp_mask) { return 0; } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index e2ceebf737ef..11d472a5c4e8 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -336,13 +336,13 @@ static void ___free_pages_bulk(struct page **pages, int nr_pages) } } -static int ___alloc_pages_bulk(struct page **pages, int nr_pages) +static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask) { unsigned long nr_populated, nr_total = nr_pages; struct page **page_array = pages; while (nr_pages) { - nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages); + nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages); if (!nr_populated) { ___free_pages_bulk(page_array, nr_total - nr_pages); return -ENOMEM; @@ -354,25 +354,42 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages) return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; + unsigned int flags; int ret = 0; - data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO); + data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO); if (!data.pages) return -ENOMEM; while (nr_total) { nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0])); - ret = ___alloc_pages_bulk(data.pages, nr_pages); + ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask); if (ret) break; data.start = start; + + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, kasan_populate_vmalloc_pte, &data); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + ___free_pages_bulk(data.pages, nr_pages); if (ret) break; @@ -386,7 +403,7 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size) +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { unsigned long shadow_start, shadow_end; int ret; @@ -415,7 +432,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end); + ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask); if (ret) return ret; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dbcdceecae1..5edd536ba9d2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2026,6 +2026,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); + /* Only reclaim behaviour flags are relevant. */ + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; might_sleep(); /* @@ -2038,8 +2040,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, */ va = node_alloc(size, align, vstart, vend, &addr, &vn_id); if (!va) { - gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -2089,7 +2089,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); - ret = kasan_populate_vmalloc(addr, size); + ret = kasan_populate_vmalloc(addr, size, gfp_mask); if (ret) { free_vmap_area(va); return ERR_PTR(ret); @@ -4826,7 +4826,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { - if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) + if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL)) goto err_free_shadow; } From 3fac212fe489aa0dbe8d80a42a7809840ca7b0f9 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 2 Sep 2025 15:49:26 -0700 Subject: [PATCH 168/450] compiler-clang.h: define __SANITIZE_*__ macros only when undefined Clang 22 recently added support for defining __SANITIZE__ macros similar to GCC [1], which causes warnings (or errors with CONFIG_WERROR=y or W=e) with the existing defines that the kernel creates to emulate this behavior with existing clang versions. In file included from :3: In file included from include/linux/compiler_types.h:171: include/linux/compiler-clang.h:37:9: error: '__SANITIZE_THREAD__' macro redefined [-Werror,-Wmacro-redefined] 37 | #define __SANITIZE_THREAD__ | ^ :352:9: note: previous definition is here 352 | #define __SANITIZE_THREAD__ 1 | ^ Refactor compiler-clang.h to only define the sanitizer macros when they are undefined and adjust the rest of the code to use these macros for checking if the sanitizers are enabled, clearing up the warnings and allowing the kernel to easily drop these defines when the minimum supported version of LLVM for building the kernel becomes 22.0.0 or newer. Link: https://lkml.kernel.org/r/20250902-clang-update-sanitize-defines-v1-1-cf3702ca3d92@kernel.org Link: https://github.com/llvm/llvm-project/commit/568c23bbd3303518c5056d7f03444dae4fdc8a9c [1] Signed-off-by: Nathan Chancellor Reviewed-by: Justin Stitt Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Bill Wendling Cc: Dmitriy Vyukov Cc: Marco Elver Cc: Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index fa4ffe037bc7..8720a0705900 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -18,23 +18,42 @@ #define KASAN_ABI_VERSION 5 /* + * Clang 22 added preprocessor macros to match GCC, in hopes of eventually + * dropping __has_feature support for sanitizers: + * https://github.com/llvm/llvm-project/commit/568c23bbd3303518c5056d7f03444dae4fdc8a9c + * Create these macros for older versions of clang so that it is easy to clean + * up once the minimum supported version of LLVM for building the kernel always + * creates these macros. + * * Note: Checking __has_feature(*_sanitizer) is only true if the feature is * enabled. Therefore it is not required to additionally check defined(CONFIG_*) * to avoid adding redundant attributes in other configurations. */ - -#if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer) -/* Emulate GCC's __SANITIZE_ADDRESS__ flag */ +#if __has_feature(address_sanitizer) && !defined(__SANITIZE_ADDRESS__) #define __SANITIZE_ADDRESS__ +#endif +#if __has_feature(hwaddress_sanitizer) && !defined(__SANITIZE_HWADDRESS__) +#define __SANITIZE_HWADDRESS__ +#endif +#if __has_feature(thread_sanitizer) && !defined(__SANITIZE_THREAD__) +#define __SANITIZE_THREAD__ +#endif + +/* + * Treat __SANITIZE_HWADDRESS__ the same as __SANITIZE_ADDRESS__ in the kernel. + */ +#ifdef __SANITIZE_HWADDRESS__ +#define __SANITIZE_ADDRESS__ +#endif + +#ifdef __SANITIZE_ADDRESS__ #define __no_sanitize_address \ __attribute__((no_sanitize("address", "hwaddress"))) #else #define __no_sanitize_address #endif -#if __has_feature(thread_sanitizer) -/* emulate gcc's __SANITIZE_THREAD__ flag */ -#define __SANITIZE_THREAD__ +#ifdef __SANITIZE_THREAD__ #define __no_sanitize_thread \ __attribute__((no_sanitize("thread"))) #else From 0ce9398aa0830f15f92bbed73853f9861c3e74ff Mon Sep 17 00:00:00 2001 From: wangzijie Date: Thu, 4 Sep 2025 21:57:15 +0800 Subject: [PATCH 169/450] proc: fix type confusion in pde_set_flags() Commit 2ce3d282bd50 ("proc: fix missing pde_set_flags() for net proc files") missed a key part in the definition of proc_dir_entry: union { const struct proc_ops *proc_ops; const struct file_operations *proc_dir_ops; }; So dereference of ->proc_ops assumes it is a proc_ops structure results in type confusion and make NULL check for 'proc_ops' not work for proc dir. Add !S_ISDIR(dp->mode) test before calling pde_set_flags() to fix it. Link: https://lkml.kernel.org/r/20250904135715.3972782-1-wangzijie1@honor.com Fixes: 2ce3d282bd50 ("proc: fix missing pde_set_flags() for net proc files") Signed-off-by: wangzijie Reported-by: Brad Spengler Closes: https://lore.kernel.org/all/20250903065758.3678537-1-wangzijie1@honor.com/ Cc: Alexey Dobriyan Cc: Al Viro Cc: Christian Brauner Cc: Jiri Slaby Cc: Stefano Brivio Cc: Signed-off-by: Andrew Morton --- fs/proc/generic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index bd0c099cfdd2..176281112273 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -393,7 +393,8 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, if (proc_alloc_inum(&dp->low_ino)) goto out_free_entry; - pde_set_flags(dp); + if (!S_ISDIR(dp->mode)) + pde_set_flags(dp); write_lock(&proc_subdir_lock); dp->parent = dir; From 3260a3f0828e06f5f13fac69fb1999a6d60d9cff Mon Sep 17 00:00:00 2001 From: Stanislav Fort Date: Fri, 5 Sep 2025 13:10:46 +0300 Subject: [PATCH 170/450] mm/damon/sysfs: fix use-after-free in state_show() state_show() reads kdamond->damon_ctx without holding damon_sysfs_lock. This allows a use-after-free race: CPU 0 CPU 1 ----- ----- state_show() damon_sysfs_turn_damon_on() ctx = kdamond->damon_ctx; mutex_lock(&damon_sysfs_lock); damon_destroy_ctx(kdamond->damon_ctx); kdamond->damon_ctx = NULL; mutex_unlock(&damon_sysfs_lock); damon_is_running(ctx); /* ctx is freed */ mutex_lock(&ctx->kdamond_lock); /* UAF */ (The race can also occur with damon_sysfs_kdamonds_rm_dirs() and damon_sysfs_kdamond_release(), which free or replace the context under damon_sysfs_lock.) Fix by taking damon_sysfs_lock before dereferencing the context, mirroring the locking used in pid_show(). The bug has existed since state_show() first accessed kdamond->damon_ctx. Link: https://lkml.kernel.org/r/20250905101046.2288-1-disclosure@aisle.com Fixes: a61ea561c871 ("mm/damon/sysfs: link DAMON for virtual address spaces monitoring") Signed-off-by: Stanislav Fort Reported-by: Stanislav Fort Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6d2b0dab50cb..7b9254cadd5f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1260,14 +1260,18 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); - struct damon_ctx *ctx = kdamond->damon_ctx; - bool running; + struct damon_ctx *ctx; + bool running = false; - if (!ctx) - running = false; - else + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + + ctx = kdamond->damon_ctx; + if (ctx) running = damon_is_running(ctx); + mutex_unlock(&damon_sysfs_lock); + return sysfs_emit(buf, "%s\n", running ? damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]); From a68172d95c2845d2b5455b072b4ff51ba32650e9 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Fri, 5 Sep 2025 12:15:57 +0300 Subject: [PATCH 171/450] MAINTAINERS: add tree entry to numa memblocks and emulation block Link: https://lkml.kernel.org/r/20250905091557.3529937-1-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Vlastimil Babka Acked-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Cc: David Hildenbrand Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6dcfbd11efef..fbdbf7c012a0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16127,6 +16127,7 @@ M: Andrew Morton M: Mike Rapoport L: linux-mm@kvack.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git F: include/linux/numa_memblks.h F: mm/numa.c F: mm/numa_emulation.c From cfa7b7659757f8d0fc4914429efa90d0d2577dd7 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 5 Sep 2025 13:41:49 +0300 Subject: [PATCH 172/450] drm/i915/power: fix size for for_each_set_bit() in abox iteration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit for_each_set_bit() expects size to be in bits, not bytes. The abox mask iteration uses bytes, but it works by coincidence, because the local variable holding the mask is unsigned long, and the mask only ever has bit 2 as the highest bit. Using a smaller type could lead to subtle and very hard to track bugs. Fixes: 62afef2811e4 ("drm/i915/rkl: RKL uses ABOX0 for pixel transfers") Cc: Ville Syrjälä Cc: Matt Roper Cc: stable@vger.kernel.org # v5.9+ Reviewed-by: Matt Roper Link: https://lore.kernel.org/r/20250905104149.1144751-1-jani.nikula@intel.com Signed-off-by: Jani Nikula (cherry picked from commit 7ea3baa6efe4bb93d11e1c0e6528b1468d7debf6) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_display_power.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display_power.c b/drivers/gpu/drm/i915/display/intel_display_power.c index 273054c22325..c92f3e736228 100644 --- a/drivers/gpu/drm/i915/display/intel_display_power.c +++ b/drivers/gpu/drm/i915/display/intel_display_power.c @@ -1172,7 +1172,7 @@ static void icl_mbus_init(struct intel_display *display) if (DISPLAY_VER(display) == 12) abox_regs |= BIT(0); - for_each_set_bit(i, &abox_regs, sizeof(abox_regs)) + for_each_set_bit(i, &abox_regs, BITS_PER_TYPE(abox_regs)) intel_de_rmw(display, MBUS_ABOX_CTL(i), mask, val); } @@ -1629,11 +1629,11 @@ static void tgl_bw_buddy_init(struct intel_display *display) if (table[config].page_mask == 0) { drm_dbg_kms(display->drm, "Unknown memory configuration; disabling address buddy logic.\n"); - for_each_set_bit(i, &abox_mask, sizeof(abox_mask)) + for_each_set_bit(i, &abox_mask, BITS_PER_TYPE(abox_mask)) intel_de_write(display, BW_BUDDY_CTL(i), BW_BUDDY_DISABLE); } else { - for_each_set_bit(i, &abox_mask, sizeof(abox_mask)) { + for_each_set_bit(i, &abox_mask, BITS_PER_TYPE(abox_mask)) { intel_de_write(display, BW_BUDDY_PAGE_MASK(i), table[config].page_mask); From de4da7bd5c5172f6362b5994f541d830119840dc Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 21 Aug 2025 17:23:09 +0200 Subject: [PATCH 173/450] KVM: s390: Fix access to unavailable adapter indicator pages during postcopy When you run a KVM guest with vhost-net and migrate that guest to another host, and you immediately enable postcopy after starting the migration, there is a big chance that the network connection of the guest won't work anymore on the destination side after the migration. With a debug kernel v6.16.0, there is also a call trace that looks like this: FAULT_FLAG_ALLOW_RETRY missing 881 CPU: 6 UID: 0 PID: 549 Comm: kworker/6:2 Kdump: loaded Not tainted 6.16.0 #56 NONE Hardware name: IBM 3931 LA1 400 (LPAR) Workqueue: events irqfd_inject [kvm] Call Trace: [<00003173cbecc634>] dump_stack_lvl+0x104/0x168 [<00003173cca69588>] handle_userfault+0xde8/0x1310 [<00003173cc756f0c>] handle_pte_fault+0x4fc/0x760 [<00003173cc759212>] __handle_mm_fault+0x452/0xa00 [<00003173cc7599ba>] handle_mm_fault+0x1fa/0x6a0 [<00003173cc73409a>] __get_user_pages+0x4aa/0xba0 [<00003173cc7349e8>] get_user_pages_remote+0x258/0x770 [<000031734be6f052>] get_map_page+0xe2/0x190 [kvm] [<000031734be6f910>] adapter_indicators_set+0x50/0x4a0 [kvm] [<000031734be7f674>] set_adapter_int+0xc4/0x170 [kvm] [<000031734be2f268>] kvm_set_irq+0x228/0x3f0 [kvm] [<000031734be27000>] irqfd_inject+0xd0/0x150 [kvm] [<00003173cc00c9ec>] process_one_work+0x87c/0x1490 [<00003173cc00dda6>] worker_thread+0x7a6/0x1010 [<00003173cc02dc36>] kthread+0x3b6/0x710 [<00003173cbed2f0c>] __ret_from_fork+0xdc/0x7f0 [<00003173cdd737ca>] ret_from_fork+0xa/0x30 3 locks held by kworker/6:2/549: #0: 00000000800bc958 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x7ee/0x1490 #1: 000030f3d527fbd0 ((work_completion)(&irqfd->inject)){+.+.}-{0:0}, at: process_one_work+0x81c/0x1490 #2: 00000000f99862b0 (&mm->mmap_lock){++++}-{3:3}, at: get_map_page+0xa8/0x190 [kvm] The "FAULT_FLAG_ALLOW_RETRY missing" indicates that handle_userfaultfd() saw a page fault request without ALLOW_RETRY flag set, hence userfaultfd cannot remotely resolve it (because the caller was asking for an immediate resolution, aka, FAULT_FLAG_NOWAIT, while remote faults can take time). With that, get_map_page() failed and the irq was lost. We should not be strictly in an atomic environment here and the worker should be sleepable (the call is done during an ioctl from userspace), so we can allow adapter_indicators_set() to just sleep waiting for the remote fault instead. Link: https://issues.redhat.com/browse/RHEL-42486 Signed-off-by: Peter Xu [thuth: Assembled patch description and fixed some cosmetical issues] Signed-off-by: Thomas Huth Reviewed-by: Claudio Imbrenda Acked-by: Janosch Frank Fixes: f65470661f36 ("KVM: s390/interrupt: do not pin adapter interrupt pages") [frankja: Added fixes tag] Signed-off-by: Janosch Frank --- arch/s390/kvm/interrupt.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 2a92a8b9e4c2..9384572ffa7b 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2778,12 +2778,19 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) static struct page *get_map_page(struct kvm *kvm, u64 uaddr) { + struct mm_struct *mm = kvm->mm; struct page *page = NULL; + int locked = 1; + + if (mmget_not_zero(mm)) { + mmap_read_lock(mm); + get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, + &page, &locked); + if (locked) + mmap_read_unlock(mm); + mmput(mm); + } - mmap_read_lock(kvm->mm); - get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, - &page, NULL); - mmap_read_unlock(kvm->mm); return page; } From 185d903064f8680c2de43514c94fddc0d75ed00a Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 25 Aug 2025 17:18:30 +0200 Subject: [PATCH 174/450] KVM: s390: Fix incorrect usage of mmu_notifier_register() If mmu_notifier_register() fails, for example because a signal was pending, the mmu_notifier will not be registered. But when the VM gets destroyed, it will get unregistered anyway and that will cause one extra mmdrop(), which will eventually cause the mm of the process to be freed too early, and cause a use-after free. This bug happens rarely, and only when secure guests are involved. The solution is to check the return value of mmu_notifier_register() and return it to the caller (ultimately it will be propagated all the way to userspace). In case of -EINTR, userspace will try again. Fixes: ca2fd0609b5d ("KVM: s390: pv: add mmu_notifier") Signed-off-by: Claudio Imbrenda Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Signed-off-by: Janosch Frank --- arch/s390/kvm/pv.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 25ede8354514..6ba5a0305e25 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -624,6 +624,17 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) int cc, ret; u16 dummy; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + /* The notifier will be unregistered when the VM is destroyed */ + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + ret = mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + if (ret) { + kvm->arch.pv.mmu_notifier.ops = NULL; + return ret; + } + } + ret = kvm_s390_pv_alloc_vm(kvm); if (ret) return ret; @@ -659,11 +670,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return -EIO; } kvm->arch.gmap->guest_handle = uvcb.guest_handle; - /* Add the notifier only once. No races because we hold kvm->lock */ - if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { - kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; - mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); - } return 0; } From 5f9df945d4e862979b50e4ecaba3dc81fb06e8ed Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Mon, 25 Aug 2025 17:18:31 +0200 Subject: [PATCH 175/450] KVM: s390: Fix FOLL_*/FAULT_FLAG_* confusion Pass the right type of flag to vcpu_dat_fault_handler(); it expects a FOLL_* flag (in particular FOLL_WRITE), but FAULT_FLAG_WRITE is passed instead. This still works because they happen to have the same integer value, but it's a mistake, thus the fix. Signed-off-by: Claudio Imbrenda Fixes: 05066cafa925 ("s390/mm/fault: Handle guest-related program interrupts in KVM") Acked-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bf6fa8b9ca73..6d51aa5f66be 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4864,12 +4864,12 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) * @vcpu: the vCPU whose gmap is to be fixed up * @gfn: the guest frame number used for memslots (including fake memslots) * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps - * @flags: FOLL_* flags + * @foll: FOLL_* flags * * Return: 0 on success, < 0 in case of error. * Context: The mm lock must not be held before calling. May sleep. */ -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags) +int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll) { struct kvm_memory_slot *slot; unsigned int fault_flags; @@ -4883,13 +4883,13 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return vcpu_post_run_addressing_exception(vcpu); - fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; + fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; if (vcpu->arch.gmap->pfault_enabled) - flags |= FOLL_NOWAIT; + foll |= FOLL_NOWAIT; vmaddr = __gfn_to_hva_memslot(slot, gfn); try_again: - pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page); + pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page); /* Access outside memory, inject addressing exception */ if (is_noslot_pfn(pfn)) @@ -4905,7 +4905,7 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u return 0; vcpu->stat.pfault_sync++; /* Could not setup async pfault, try again synchronously */ - flags &= ~FOLL_NOWAIT; + foll &= ~FOLL_NOWAIT; goto try_again; } /* Any other error */ @@ -4925,7 +4925,7 @@ int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, u return rc; } -static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags) +static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll) { unsigned long gaddr_tmp; gfn_t gfn; @@ -4950,18 +4950,18 @@ static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, un } gfn = gpa_to_gfn(gaddr_tmp); } - return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags); + return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll); } static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) { - unsigned int flags = 0; + unsigned int foll = 0; unsigned long gaddr; int rc; gaddr = current->thread.gmap_teid.addr * PAGE_SIZE; if (kvm_s390_cur_gmap_fault_is_write()) - flags = FAULT_FLAG_WRITE; + foll = FOLL_WRITE; switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) { case 0: @@ -5003,7 +5003,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) send_sig(SIGSEGV, current, 0); if (rc != -ENXIO) break; - flags = FAULT_FLAG_WRITE; + foll = FOLL_WRITE; fallthrough; case PGM_PROTECTION: case PGM_SEGMENT_TRANSLATION: @@ -5013,7 +5013,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) case PGM_REGION_SECOND_TRANS: case PGM_REGION_THIRD_TRANS: kvm_s390_assert_primary_as(vcpu); - return vcpu_dat_fault_handler(vcpu, gaddr, flags); + return vcpu_dat_fault_handler(vcpu, gaddr, foll); default: KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx", current->thread.gmap_int_code, current->thread.gmap_teid.val); From bf59028ea8d42e8d10bb3d847c9982488ee9e3a0 Mon Sep 17 00:00:00 2001 From: Oscar Maes Date: Tue, 2 Sep 2025 17:02:40 +0200 Subject: [PATCH 176/450] selftests: net: add test for destination in broadcast packets Add test to check the broadcast ethernet destination field is set correctly. This test sends a broadcast ping, captures it using tcpdump and ensures that all bits of the 6 octet ethernet destination address are correctly set by examining the output capture file. Co-developed-by: Brett A C Sheffield Signed-off-by: Brett A C Sheffield Signed-off-by: Oscar Maes Link: https://patch.msgid.link/20250902150240.4272-1-oscmaes92@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/broadcast_ether_dst.sh | 83 +++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100755 tools/testing/selftests/net/broadcast_ether_dst.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index c7e03e1d6f63..2b31d4a93ad7 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -116,6 +116,7 @@ TEST_PROGS += skf_net_off.sh TEST_GEN_FILES += skf_net_off TEST_GEN_FILES += tfo TEST_PROGS += tfo_passive.sh +TEST_PROGS += broadcast_ether_dst.sh TEST_PROGS += broadcast_pmtu.sh TEST_PROGS += ipv6_force_forwarding.sh diff --git a/tools/testing/selftests/net/broadcast_ether_dst.sh b/tools/testing/selftests/net/broadcast_ether_dst.sh new file mode 100755 index 000000000000..334a7eca8a80 --- /dev/null +++ b/tools/testing/selftests/net/broadcast_ether_dst.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Author: Brett A C Sheffield +# Author: Oscar Maes +# +# Ensure destination ethernet field is correctly set for +# broadcast packets + +source lib.sh + +CLIENT_IP4="192.168.0.1" +GW_IP4="192.168.0.2" + +setup() { + setup_ns CLIENT_NS SERVER_NS + + ip -net "${SERVER_NS}" link add link1 type veth \ + peer name link0 netns "${CLIENT_NS}" + + ip -net "${CLIENT_NS}" link set link0 up + ip -net "${CLIENT_NS}" addr add "${CLIENT_IP4}"/24 dev link0 + + ip -net "${SERVER_NS}" link set link1 up + + ip -net "${CLIENT_NS}" route add default via "${GW_IP4}" + ip netns exec "${CLIENT_NS}" arp -s "${GW_IP4}" 00:11:22:33:44:55 +} + +cleanup() { + rm -f "${CAPFILE}" "${OUTPUT}" + ip -net "${SERVER_NS}" link del link1 + cleanup_ns "${CLIENT_NS}" "${SERVER_NS}" +} + +test_broadcast_ether_dst() { + local rc=0 + CAPFILE=$(mktemp -u cap.XXXXXXXXXX) + OUTPUT=$(mktemp -u out.XXXXXXXXXX) + + echo "Testing ethernet broadcast destination" + + # start tcpdump listening for icmp + # tcpdump will exit after receiving a single packet + # timeout will kill tcpdump if it is still running after 2s + timeout 2s ip netns exec "${CLIENT_NS}" \ + tcpdump -i link0 -c 1 -w "${CAPFILE}" icmp &> "${OUTPUT}" & + pid=$! + slowwait 1 grep -qs "listening" "${OUTPUT}" + + # send broadcast ping + ip netns exec "${CLIENT_NS}" \ + ping -W0.01 -c1 -b 255.255.255.255 &> /dev/null + + # wait for tcpdump for exit after receiving packet + wait "${pid}" + + # compare ethernet destination field to ff:ff:ff:ff:ff:ff + ether_dst=$(tcpdump -r "${CAPFILE}" -tnne 2>/dev/null | \ + awk '{sub(/,/,"",$3); print $3}') + if [[ "${ether_dst}" == "ff:ff:ff:ff:ff:ff" ]]; then + echo "[ OK ]" + rc="${ksft_pass}" + else + echo "[FAIL] expected dst ether addr to be ff:ff:ff:ff:ff:ff," \ + "got ${ether_dst}" + rc="${ksft_fail}" + fi + + return "${rc}" +} + +if [ ! -x "$(command -v tcpdump)" ]; then + echo "SKIP: Could not run test without tcpdump tool" + exit "${ksft_skip}" +fi + +trap cleanup EXIT + +setup +test_broadcast_ether_dst + +exit $? From d2e1b84c5141ff2ad465279acfc3cf943c960b78 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 8 Sep 2025 15:15:51 -0700 Subject: [PATCH 177/450] fs/resctrl: Eliminate false positive lockdep warning when reading SNC counters Running resctrl_tests on an SNC-2 system with lockdep debugging enabled triggers several warnings with following trace: WARNING: CPU: 0 PID: 1914 at kernel/cpu.c:528 lockdep_assert_cpus_held ... Call Trace: __mon_event_count ? __lock_acquire ? __pfx___mon_event_count mon_event_count ? __pfx_smp_mon_event_count smp_mon_event_count smp_call_on_cpu_callback get_cpu_cacheinfo_level() called from __mon_event_count() requires CPU hotplug lock to be held. The hotplug lock is indeed held during this time, as confirmed by the lockdep_assert_cpus_held() within mon_event_read() that calls mon_event_count() via IPI, but the lockdep tracking is not able to follow the IPI. Fresh CPU cache information via get_cpu_cacheinfo_level() from __mon_event_count() was added to support the fix for the issue where resctrl inappropriately maintained links to L3 cache information that will be stale in the case when the associated CPU goes offline. Keep the cacheinfo ID in struct rdt_mon_domain to ensure that resctrl does not maintain stale cache information while CPUs can go offline. Return to using a pointer to the L3 cache information (struct cacheinfo) in struct rmid_read, rmid_read::ci. Initialize rmid_read::ci before the IPI where it is used. CPU hotplug lock is held across rmid_read::ci initialization and use to ensure that it points to accurate cache information. Fixes: 594902c986e2 ("x86,fs/resctrl: Remove inappropriate references to cacheinfo in the resctrl subsystem") Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov (AMD) --- fs/resctrl/ctrlmondata.c | 2 +- fs/resctrl/internal.h | 4 ++-- fs/resctrl/monitor.c | 6 ++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index d98e0d2de09f..3c39cfacb251 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -625,11 +625,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) */ list_for_each_entry(d, &r->mon_domains, hdr.list) { if (d->ci_id == domid) { - rr.ci_id = d->ci_id; cpu = cpumask_any(&d->hdr.cpu_mask); ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) continue; + rr.ci = ci; mon_event_read(&rr, r, NULL, rdtgrp, &ci->shared_cpu_map, evtid, false); goto checkresult; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 0a1eedba2b03..9a8cf6f11151 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -98,7 +98,7 @@ struct mon_data { * domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains. + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -112,7 +112,7 @@ struct rmid_read { struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; - unsigned int ci_id; + struct cacheinfo *ci; int err; u64 val; void *arch_mon_ctx; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index f5637855c3ac..7326c28a7908 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -361,7 +361,6 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); struct rdt_mon_domain *d; - struct cacheinfo *ci; struct mbm_state *m; int err, ret; u64 tval = 0; @@ -389,8 +388,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* Summing domains that share a cache, must be on a CPU for that cache. */ - ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!ci || ci->id != rr->ci_id) + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) return -EINVAL; /* @@ -402,7 +400,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ ret = -EINVAL; list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci_id != rr->ci_id) + if (d->ci_id != rr->ci->id) continue; err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, rr->evtid, &tval, rr->arch_mon_ctx); From 2682e7a317504a9d81cbb397249d4299e84dfadd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 9 Sep 2025 12:17:34 +0300 Subject: [PATCH 178/450] wifi: iwlwifi: fix 130/1030 configs The 130/1030 devices are really derivatives of 6030, with some small differences not pertaining to the MAC, so they must use the 6030 MAC config. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220472 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220517 Fixes: 35ac275ebe0c ("wifi: iwlwifi: cfg: finish config split") Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250909121728.8e4911f12528.I3aa7194012a4b584fbd5ddaa3a77e483280f1de4@changeid Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index f9e2095d6490..7e56e4ff7642 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -124,13 +124,13 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct pci_device_id iwl_hw_card_ids[] = { {IWL_PCI_DEVICE(0x0082, 0x1304, iwl6005_mac_cfg)},/* low 5GHz active */ {IWL_PCI_DEVICE(0x0082, 0x1305, iwl6005_mac_cfg)},/* high 5GHz active */ -/* 6x30 Series */ - {IWL_PCI_DEVICE(0x008A, 0x5305, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008A, 0x5307, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008A, 0x5325, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008A, 0x5327, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008B, 0x5315, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x008B, 0x5317, iwl1000_mac_cfg)}, +/* 1030/6x30 Series */ + {IWL_PCI_DEVICE(0x008A, 0x5305, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008A, 0x5307, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008A, 0x5325, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008A, 0x5327, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008B, 0x5315, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x008B, 0x5317, iwl6030_mac_cfg)}, {IWL_PCI_DEVICE(0x0090, 0x5211, iwl6030_mac_cfg)}, {IWL_PCI_DEVICE(0x0090, 0x5215, iwl6030_mac_cfg)}, {IWL_PCI_DEVICE(0x0090, 0x5216, iwl6030_mac_cfg)}, @@ -181,12 +181,12 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct pci_device_id iwl_hw_card_ids[] = { {IWL_PCI_DEVICE(0x08AE, 0x1027, iwl1000_mac_cfg)}, /* 130 Series WiFi */ - {IWL_PCI_DEVICE(0x0896, 0x5005, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0896, 0x5007, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0897, 0x5015, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0897, 0x5017, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0896, 0x5025, iwl1000_mac_cfg)}, - {IWL_PCI_DEVICE(0x0896, 0x5027, iwl1000_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5005, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5007, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0897, 0x5015, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0897, 0x5017, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5025, iwl6030_mac_cfg)}, + {IWL_PCI_DEVICE(0x0896, 0x5027, iwl6030_mac_cfg)}, /* 2x00 Series */ {IWL_PCI_DEVICE(0x0890, 0x4022, iwl2000_mac_cfg)}, From 15f519e9f883b316d86e2bb6b767a023aafd9d83 Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Tue, 12 Aug 2025 09:57:38 +0000 Subject: [PATCH 179/450] ceph: fix race condition validating r_parent before applying state Add validation to ensure the cached parent directory inode matches the directory info in MDS replies. This prevents client-side race conditions where concurrent operations (e.g. rename) cause r_parent to become stale between request initiation and reply processing, which could lead to applying state changes to incorrect directory inodes. [ idryomov: folded a kerneldoc fixup and a follow-up fix from Alex to move CEPH_CAP_PIN reference when r_parent is updated: When the parent directory lock is not held, req->r_parent can become stale and is updated to point to the correct inode. However, the associated CEPH_CAP_PIN reference was not being adjusted. The CEPH_CAP_PIN is a reference on an inode that is tracked for accounting purposes. Moving this pin is important to keep the accounting balanced. When the pin was not moved from the old parent to the new one, it created two problems: The reference on the old, stale parent was never released, causing a reference leak. A reference for the new parent was never acquired, creating the risk of a reference underflow later in ceph_mdsc_release_request(). This patch corrects the logic by releasing the pin from the old parent and acquiring it for the new parent when r_parent is switched. This ensures reference accounting stays balanced. ] Cc: stable@vger.kernel.org Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 14 ++-- fs/ceph/dir.c | 17 ++--- fs/ceph/file.c | 24 +++--- fs/ceph/inode.c | 7 +- fs/ceph/mds_client.c | 172 ++++++++++++++++++++++++++----------------- fs/ceph/mds_client.h | 18 ++++- 6 files changed, 145 insertions(+), 107 deletions(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fdd404fc8112..f3fe786b4143 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; - int pathlen = 0; - u64 pathbase; char *path; mutex_lock(&mdsc->mutex); @@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p) if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); } else if (req->r_dentry) { - path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_dentry->d_lock); @@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_dentry, path ? path : ""); spin_unlock(&req->r_dentry->d_lock); - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); @@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p) } if (req->r_old_dentry) { - path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen, - &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_old_dentry->d_lock); @@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) req->r_old_dentry, path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8478e7e75df6..32973c62c1a2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, /* If op failed, mark everyone involved for errors */ if (result) { - int pathlen = 0; - u64 base = 0; - char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, - &base, 0); + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); /* mark error on parent + clear complete */ mapping_set_error(req->r_parent->i_mapping, result); @@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, mapping_set_error(req->r_old_inode->i_mapping, result); pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<>" : path, result); - ceph_mdsc_free_path(path, pathlen); + path_info.vino.ino, IS_ERR(path) ? "<>" : path, result); + ceph_mdsc_free_path_info(&path_info); } out: iput(req->r_old_inode); @@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) int err = -EROFS; int op; char *path; - int pathlen; - u64 pathbase; if (ceph_snap(dir) == CEPH_SNAPDIR) { /* rmdir .snap/foo is RMSNAP */ @@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) if (!dn) { try_async = false; } else { - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); if (IS_ERR(path)) { try_async = false; err = 0; } else { err = ceph_mds_check_access(mdsc, path, MAY_WRITE); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dn); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c02f100f8552..978acd3d4b32 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file) int flags, fmode, wanted; struct dentry *dentry; char *path; - int pathlen; - u64 pathbase; bool do_sync = false; int mask = MAY_READ; @@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file) if (!dentry) { do_sync = true; } else { - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); if (IS_ERR(path)) { do_sync = true; err = 0; } else { err = ceph_mds_check_access(mdsc, path, mask); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dentry); /* For none EACCES cases will let the MDS do the mds auth check */ @@ -614,15 +613,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, mapping_set_error(req->r_parent->i_mapping, result); if (result) { - int pathlen = 0; - u64 base = 0; - char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, - &base, 0); + struct ceph_path_info path_info = {0}; + char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); pr_warn_client(cl, "async create failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<>" : path, result); - ceph_mdsc_free_path(path, pathlen); + path_info.vino.ino, IS_ERR(path) ? "<>" : path, result); + ceph_mdsc_free_path_info(&path_info); ceph_dir_clear_complete(req->r_parent); if (!d_unhashed(dentry)) @@ -791,8 +788,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, int mask; int err; char *path; - int pathlen; - u64 pathbase; doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n", dir, ceph_vinop(dir), dentry, dentry, @@ -814,7 +809,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (!dn) { try_async = false; } else { - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0); if (IS_ERR(path)) { try_async = false; err = 0; @@ -826,7 +822,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, mask |= MAY_WRITE; err = ceph_mds_check_access(mdsc, path, mask); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dn); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fc543075b827..8ac89ce6435c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2487,22 +2487,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, int truncate_retry = 20; /* The RMW will take around 50ms */ struct dentry *dentry; char *path; - int pathlen; - u64 pathbase; bool do_sync = false; dentry = d_find_alias(inode); if (!dentry) { do_sync = true; } else { - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0); + struct ceph_path_info path_info; + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0); if (IS_ERR(path)) { do_sync = true; err = 0; } else { err = ceph_mds_check_access(mdsc, path, MAY_WRITE); } - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); dput(dentry); /* For none EACCES cases will let the MDS do the mds auth check */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0f497c39ff82..3bc72b47fe4d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2681,8 +2681,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) * ceph_mdsc_build_path - build a path string to a given dentry * @mdsc: mds client * @dentry: dentry to which path should be built - * @plen: returned length of string - * @pbase: returned base inode number + * @path_info: output path, length, base ino+snap, and freepath ownership flag * @for_wire: is this path going to be sent to the MDS? * * Build a string that represents the path to the dentry. This is mostly called @@ -2700,7 +2699,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) * foo/.snap/bar -> foo//bar */ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, - int *plen, u64 *pbase, int for_wire) + struct ceph_path_info *path_info, int for_wire) { struct ceph_client *cl = mdsc->fsc->client; struct dentry *cur; @@ -2810,16 +2809,28 @@ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); } - *pbase = base; - *plen = PATH_MAX - 1 - pos; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + + path_info->vino.ino = base; + path_info->pathlen = PATH_MAX - 1 - pos; + path_info->path = path + pos; + path_info->freepath = true; + + /* Set snap from dentry if available */ + if (d_inode(dentry)) + path_info->vino.snap = ceph_snap(d_inode(dentry)); + else + path_info->vino.snap = CEPH_NOSNAP; + doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), - base, *plen, path + pos); + base, PATH_MAX - 1 - pos, path + pos); return path + pos; } static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry, - struct inode *dir, const char **ppath, int *ppathlen, - u64 *pino, bool *pfreepath, bool parent_locked) + struct inode *dir, struct ceph_path_info *path_info, + bool parent_locked) { char *path; @@ -2828,41 +2839,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry dir = d_inode_rcu(dentry->d_parent); if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && !IS_ENCRYPTED(dir)) { - *pino = ceph_ino(dir); + path_info->vino.ino = ceph_ino(dir); + path_info->vino.snap = ceph_snap(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + path_info->path = dentry->d_name.name; + path_info->pathlen = dentry->d_name.len; + path_info->freepath = false; return 0; } rcu_read_unlock(); - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = true; + /* + * ceph_mdsc_build_path already fills path_info, including snap handling. + */ return 0; } -static int build_inode_path(struct inode *inode, - const char **ppath, int *ppathlen, u64 *pino, - bool *pfreepath) +static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct dentry *dentry; char *path; if (ceph_snap(inode) == CEPH_NOSNAP) { - *pino = ceph_ino(inode); - *ppathlen = 0; + path_info->vino.ino = ceph_ino(inode); + path_info->vino.snap = ceph_snap(inode); + path_info->pathlen = 0; + path_info->freepath = false; return 0; } dentry = d_find_alias(inode); - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1); dput(dentry); if (IS_ERR(path)) return PTR_ERR(path); - *ppath = path; - *pfreepath = true; + /* + * ceph_mdsc_build_path already fills path_info, including snap from dentry. + * Override with inode's snap since that's what this function is for. + */ + path_info->vino.snap = ceph_snap(inode); return 0; } @@ -2872,26 +2889,32 @@ static int build_inode_path(struct inode *inode, */ static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, - const char *rpath, u64 rino, const char **ppath, - int *pathlen, u64 *ino, bool *freepath, + const char *rpath, u64 rino, + struct ceph_path_info *path_info, bool parent_locked) { struct ceph_client *cl = mdsc->fsc->client; int r = 0; + /* Initialize the output structure */ + memset(path_info, 0, sizeof(*path_info)); + if (rinode) { - r = build_inode_path(rinode, ppath, pathlen, ino, freepath); + r = build_inode_path(rinode, path_info); doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino, - freepath, parent_locked); - doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); + r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked); + doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino, + path_info->pathlen, path_info->path); } else if (rpath || rino) { - *ino = rino; - *ppath = rpath; - *pathlen = rpath ? strlen(rpath) : 0; - doutc(cl, " path %.*s\n", *pathlen, rpath); + path_info->vino.ino = rino; + path_info->vino.snap = CEPH_NOSNAP; + path_info->path = rpath; + path_info->pathlen = rpath ? strlen(rpath) : 0; + path_info->freepath = false; + + doutc(cl, " path %.*s\n", path_info->pathlen, rpath); } return r; @@ -2968,11 +2991,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_request_head_legacy *lhead; - const char *path1 = NULL; - const char *path2 = NULL; - u64 ino1 = 0, ino2 = 0; - int pathlen1 = 0, pathlen2 = 0; - bool freepath1 = false, freepath2 = false; + struct ceph_path_info path_info1 = {0}; + struct ceph_path_info path_info2 = {0}; struct dentry *old_dentry = NULL; int len; u16 releases; @@ -2982,25 +3002,49 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, u16 request_head_version = mds_supported_head_version(session); kuid_t caller_fsuid = req->r_cred->fsuid; kgid_t caller_fsgid = req->r_cred->fsgid; + bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, - req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1, - test_bit(CEPH_MDS_R_PARENT_LOCKED, - &req->r_req_flags)); + req->r_parent, req->r_path1, req->r_ino1.ino, + &path_info1, parent_locked); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* + * When the parent directory's i_rwsem is *not* locked, req->r_parent may + * have become stale (e.g. after a concurrent rename) between the time the + * dentry was looked up and now. If we detect that the stored r_parent + * does not match the inode number we just encoded for the request, switch + * to the correct inode so that the MDS receives a valid parent reference. + */ + if (!parent_locked && req->r_parent && path_info1.vino.ino && + ceph_ino(req->r_parent) != path_info1.vino.ino) { + struct inode *old_parent = req->r_parent; + struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL); + if (!IS_ERR(correct_dir)) { + WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n", + ceph_ino(old_parent), path_info1.vino.ino); + /* + * Transfer CEPH_CAP_PIN from the old parent to the new one. + * The pin was taken earlier in ceph_mdsc_submit_request(). + */ + ceph_put_cap_refs(ceph_inode(old_parent), CEPH_CAP_PIN); + iput(old_parent); + req->r_parent = correct_dir; + ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + } + } + /* If r_old_dentry is set, then assume that its parent is locked */ if (req->r_old_dentry && !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) old_dentry = req->r_old_dentry; ret = set_request_path_attr(mdsc, NULL, old_dentry, - req->r_old_dentry_dir, - req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2, true); + req->r_old_dentry_dir, + req->r_path2, req->r_ino2.ino, + &path_info2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; @@ -3031,7 +3075,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, /* filepaths */ len += 2 * (1 + sizeof(u32) + sizeof(u64)); - len += pathlen1 + pathlen2; + len += path_info1.pathlen + path_info2.pathlen; /* cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -3039,9 +3083,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, !!req->r_old_inode_drop + !!req->r_old_dentry_drop); if (req->r_dentry_drop) - len += pathlen1; + len += path_info1.pathlen; if (req->r_old_dentry_drop) - len += pathlen2; + len += path_info2.pathlen; /* MClientRequest tail */ @@ -3154,8 +3198,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, lhead->ino = cpu_to_le64(req->r_deleg_ino); lhead->args = req->r_args; - ceph_encode_filepath(&p, end, ino1, path1); - ceph_encode_filepath(&p, end, ino2, path2); + ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path); + ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path); /* make note of release offset, in case we need to replay */ req->r_request_release_offset = p - msg->front.iov_base; @@ -3218,11 +3262,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.data_off = cpu_to_le16(0); out_free2: - if (freepath2) - ceph_mdsc_free_path((char *)path2, pathlen2); + ceph_mdsc_free_path_info(&path_info2); out_free1: - if (freepath1) - ceph_mdsc_free_path((char *)path1, pathlen1); + ceph_mdsc_free_path_info(&path_info1); out: return msg; out_err: @@ -4579,24 +4621,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; struct ceph_cap *cap; - char *path; - int pathlen = 0, err; - u64 pathbase; + struct ceph_path_info path_info = {0}; + int err; u64 snap_follows; dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, recon_state->msg_version >= 2); dput(dentry); if (IS_ERR(path)) { err = PTR_ERR(path); goto out_err; } - } else { - path = NULL; - pathbase = 0; } spin_lock(&ci->i_ceph_lock); @@ -4629,7 +4667,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.pathbase = cpu_to_le64(path_info.vino.ino); rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { @@ -4644,7 +4682,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) ts = inode_get_atime(inode); ceph_encode_timespec64(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v1.pathbase = cpu_to_le64(pathbase); + rec.v1.pathbase = cpu_to_le64(path_info.vino.ino); } if (list_empty(&ci->i_cap_snaps)) { @@ -4706,7 +4744,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) sizeof(struct ceph_filelock); rec.v2.flock_len = cpu_to_le32(struct_len); - struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); + struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2); if (struct_v >= 2) struct_len += sizeof(u64); /* snap_follows */ @@ -4730,7 +4768,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, struct_len); } - ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); ceph_locks_to_pagelist(flocks, pagelist, num_fcntl_locks, num_flock_locks); @@ -4741,17 +4779,17 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) } else { err = ceph_pagelist_reserve(pagelist, sizeof(u64) + sizeof(u32) + - pathlen + sizeof(rec.v1)); + path_info.pathlen + sizeof(rec.v1)); if (err) goto out_err; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); - ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); } out_err: - ceph_mdsc_free_path(path, pathlen); + ceph_mdsc_free_path_info(&path_info); if (!err) recon_state->nr_caps++; return err; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3e2a6fa7c19a..0428a5eaf28c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); -static inline void ceph_mdsc_free_path(char *path, int len) +/* + * Structure to group path-related output parameters for build_*_path functions + */ +struct ceph_path_info { + const char *path; + int pathlen; + struct ceph_vino vino; + bool freepath; +}; + +static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info) { - if (!IS_ERR_OR_NULL(path)) - __putname(path - (PATH_MAX - 1 - len)); + if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path)) + __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen)); } extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, - struct dentry *dentry, int *plen, u64 *base, + struct dentry *dentry, struct ceph_path_info *path_info, int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); From bec324f33d1ed346394b2eee25bf6dbf3511f727 Mon Sep 17 00:00:00 2001 From: Alex Markuze Date: Tue, 12 Aug 2025 09:57:39 +0000 Subject: [PATCH 180/450] ceph: fix race condition where r_parent becomes stale before sending message When the parent directory's i_rwsem is not locked, req->r_parent may become stale due to concurrent operations (e.g. rename) between dentry lookup and message creation. Validate that r_parent matches the encoded parent inode and update to the correct inode if a mismatch is detected. [ idryomov: folded a follow-up fix from Alex to drop extra reference from ceph_get_reply_dir() in ceph_fill_trace(): ceph_get_reply_dir() may return a different, referenced inode when r_parent is stale and the parent directory lock is not held. ceph_fill_trace() used that inode but failed to drop the reference when it differed from req->r_parent, leaking an inode reference. Keep the directory inode in a local variable and iput() it at function end if it does not match req->r_parent. ] Cc: stable@vger.kernel.org Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 81 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 12 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8ac89ce6435c..f67025465de0 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -55,6 +55,52 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } +/* + * Check if the parent inode matches the vino from directory reply info + */ +static inline bool ceph_vino_matches_parent(struct inode *parent, + struct ceph_vino vino) +{ + return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap; +} + +/* + * Validate that the directory inode referenced by @req->r_parent matches the + * inode number and snapshot id contained in the reply's directory record. If + * they do not match – which can theoretically happen if the parent dentry was + * moved between the time the request was issued and the reply arrived – fall + * back to looking up the correct inode in the inode cache. + * + * A reference is *always* returned. Callers that receive a different inode + * than the original @parent are responsible for dropping the extra reference + * once the reply has been processed. + */ +static struct inode *ceph_get_reply_dir(struct super_block *sb, + struct inode *parent, + struct ceph_mds_reply_info_parsed *rinfo) +{ + struct ceph_vino vino; + + if (unlikely(!rinfo->diri.in)) + return parent; /* nothing to compare against */ + + /* If we didn't have a cached parent inode to begin with, just bail out. */ + if (!parent) + return NULL; + + vino.ino = le64_to_cpu(rinfo->diri.in->ino); + vino.snap = le64_to_cpu(rinfo->diri.in->snapid); + + if (likely(ceph_vino_matches_parent(parent, vino))) + return parent; /* matches – use the original reference */ + + /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */ + WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n", + ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap); + + return ceph_get_inode(sb, vino, NULL); +} + /** * ceph_new_inode - allocate a new inode in advance of an expected create * @dir: parent directory for new inode @@ -1523,6 +1569,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) struct ceph_vino tvino, dvino; struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_client *cl = fsc->client; + struct inode *parent_dir = NULL; int err = 0; doutc(cl, "%p is_dentry %d is_target %d\n", req, @@ -1536,10 +1583,17 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) } if (rinfo->head->is_dentry) { - struct inode *dir = req->r_parent; - - if (dir) { - err = ceph_fill_inode(dir, NULL, &rinfo->diri, + /* + * r_parent may be stale, in cases when R_PARENT_LOCKED is not set, + * so we need to get the correct inode + */ + parent_dir = ceph_get_reply_dir(sb, req->r_parent, rinfo); + if (unlikely(IS_ERR(parent_dir))) { + err = PTR_ERR(parent_dir); + goto done; + } + if (parent_dir) { + err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri, rinfo->dirfrag, session, -1, &req->r_caps_reservation); if (err < 0) @@ -1548,14 +1602,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) WARN_ON_ONCE(1); } - if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && + if (parent_dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { bool is_nokey = false; struct qstr dname; struct dentry *dn, *parent; struct fscrypt_str oname = FSTR_INIT(NULL, 0); - struct ceph_fname fname = { .dir = dir, + struct ceph_fname fname = { .dir = parent_dir, .name = rinfo->dname, .ctext = rinfo->altname, .name_len = rinfo->dname_len, @@ -1564,10 +1618,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) BUG_ON(!rinfo->head->is_target); BUG_ON(req->r_dentry); - parent = d_find_any_alias(dir); + parent = d_find_any_alias(parent_dir); BUG_ON(!parent); - err = ceph_fname_alloc_buffer(dir, &oname); + err = ceph_fname_alloc_buffer(parent_dir, &oname); if (err < 0) { dput(parent); goto done; @@ -1576,7 +1630,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); if (err < 0) { dput(parent); - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); goto done; } dname.name = oname.name; @@ -1595,7 +1649,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) dname.len, dname.name, dn); if (!dn) { dput(parent); - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); err = -ENOMEM; goto done; } @@ -1610,12 +1664,12 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) ceph_snap(d_inode(dn)) != tvino.snap)) { doutc(cl, " dn %p points to wrong inode %p\n", dn, d_inode(dn)); - ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(parent_dir); d_delete(dn); dput(dn); goto retry_lookup; } - ceph_fname_free_buffer(dir, &oname); + ceph_fname_free_buffer(parent_dir, &oname); req->r_dentry = dn; dput(parent); @@ -1794,6 +1848,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) &dvino, ptvino); } done: + /* Drop extra ref from ceph_get_reply_dir() if it returned a new inode */ + if (unlikely(!IS_ERR_OR_NULL(parent_dir) && parent_dir != req->r_parent)) + iput(parent_dir); doutc(cl, "done err=%d\n", err); return err; } From cce7c15faaac79b532a07ed6ab8332280ad83762 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 27 Aug 2025 20:17:08 +0200 Subject: [PATCH 181/450] ceph: always call ceph_shift_unused_folios_left() The function ceph_process_folio_batch() sets folio_batch entries to NULL, which is an illegal state. Before folio_batch_release() crashes due to this API violation, the function ceph_shift_unused_folios_left() is supposed to remove those NULLs from the array. However, since commit ce80b76dd327 ("ceph: introduce ceph_process_folio_batch() method"), this shifting doesn't happen anymore because the "for" loop got moved to ceph_process_folio_batch(), and now the `i` variable that remains in ceph_writepages_start() doesn't get incremented anymore, making the shifting effectively unreachable much of the time. Later, commit 1551ec61dc55 ("ceph: introduce ceph_submit_write() method") added more preconditions for doing the shift, replacing the `i` check (with something that is still just as broken): - if ceph_process_folio_batch() fails, shifting never happens - if ceph_move_dirty_page_in_page_array() was never called (because ceph_process_folio_batch() has returned early for some of various reasons), shifting never happens - if `processed_in_fbatch` is zero (because ceph_process_folio_batch() has returned early for some of the reasons mentioned above or because ceph_move_dirty_page_in_page_array() has failed), shifting never happens Since those two commits, any problem in ceph_process_folio_batch() could crash the kernel, e.g. this way: BUG: kernel NULL pointer dereference, address: 0000000000000034 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: Oops: 0002 [#1] SMP NOPTI CPU: 172 UID: 0 PID: 2342707 Comm: kworker/u778:8 Not tainted 6.15.10-cm4all1-es #714 NONE Hardware name: Dell Inc. PowerEdge R7615/0G9DHV, BIOS 1.6.10 12/08/2023 Workqueue: writeback wb_workfn (flush-ceph-1) RIP: 0010:folios_put_refs+0x85/0x140 Code: 83 c5 01 39 e8 7e 76 48 63 c5 49 8b 5c c4 08 b8 01 00 00 00 4d 85 ed 74 05 41 8b 44 ad 00 48 8b 15 b0 > RSP: 0018:ffffb880af8db778 EFLAGS: 00010207 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000003 RDX: ffffe377cc3b0000 RSI: 0000000000000000 RDI: ffffb880af8db8c0 RBP: 0000000000000000 R08: 000000000000007d R09: 000000000102b86f R10: 0000000000000001 R11: 00000000000000ac R12: ffffb880af8db8c0 R13: 0000000000000000 R14: 0000000000000000 R15: ffff9bd262c97000 FS: 0000000000000000(0000) GS:ffff9c8efc303000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000034 CR3: 0000000160958004 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: ceph_writepages_start+0xeb9/0x1410 The crash can be reproduced easily by changing the ceph_check_page_before_write() return value to `-E2BIG`. (Interestingly, the crash happens only if `huge_zero_folio` has already been allocated; without `huge_zero_folio`, is_huge_zero_folio(NULL) returns true and folios_put_refs() skips NULL entries instead of dereferencing them. That makes reproducing the bug somewhat unreliable. See https://lore.kernel.org/20250826231626.218675-1-max.kellermann@ionos.com for a discussion of this detail.) My suggestion is to move the ceph_shift_unused_folios_left() to right after ceph_process_folio_batch() to ensure it always gets called to fix up the illegal folio_batch state. Cc: stable@vger.kernel.org Fixes: ce80b76dd327 ("ceph: introduce ceph_process_folio_batch() method") Link: https://lore.kernel.org/ceph-devel/aK4v548CId5GIKG1@swift.blarg.de/ Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8b202d789e93..8bc66b45dade 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1687,6 +1687,7 @@ static int ceph_writepages_start(struct address_space *mapping, process_folio_batch: rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); if (rc) goto release_folios; @@ -1695,8 +1696,6 @@ static int ceph_writepages_start(struct address_space *mapping, goto release_folios; if (ceph_wbc.processed_in_fbatch) { - ceph_shift_unused_folios_left(&ceph_wbc.fbatch); - if (folio_batch_count(&ceph_wbc.fbatch) == 0 && ceph_wbc.locked_pages < ceph_wbc.max_pages) { doutc(cl, "reached end fbatch, trying for more\n"); From 249e0a47cdb46bb9eae65511c569044bd8698d7d Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 28 Aug 2025 13:15:52 +0200 Subject: [PATCH 182/450] ceph: fix crash after fscrypt_encrypt_pagecache_blocks() error The function move_dirty_folio_in_page_array() was created by commit ce80b76dd327 ("ceph: introduce ceph_process_folio_batch() method") by moving code from ceph_writepages_start() to this function. This new function is supposed to return an error code which is checked by the caller (now ceph_process_folio_batch()), and on error, the caller invokes redirty_page_for_writepage() and then breaks from the loop. However, the refactoring commit has gone wrong, and it by accident, it always returns 0 (= success) because it first NULLs the pointer and then returns PTR_ERR(NULL) which is always 0. This means errors are silently ignored, leaving NULL entries in the page array, which may later crash the kernel. The simple solution is to call PTR_ERR() before clearing the pointer. Cc: stable@vger.kernel.org Fixes: ce80b76dd327 ("ceph: introduce ceph_process_folio_batch() method") Link: https://lore.kernel.org/ceph-devel/aK4v548CId5GIKG1@swift.blarg.de/ Signed-off-by: Max Kellermann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8bc66b45dade..322ed268f14a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1264,7 +1264,9 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, 0, gfp_flags); if (IS_ERR(pages[index])) { - if (PTR_ERR(pages[index]) == -EINVAL) { + int err = PTR_ERR(pages[index]); + + if (err == -EINVAL) { pr_err_client(cl, "inode->i_blkbits=%hhu\n", inode->i_blkbits); } @@ -1273,7 +1275,7 @@ static inline int move_dirty_folio_in_page_array(struct address_space *mapping, BUG_ON(ceph_wbc->locked_pages == 0); pages[index] = NULL; - return PTR_ERR(pages[index]); + return err; } } else { pages[index] = &folio->page; From e3c674db356c4303804b2415e7c2b11776cdd8c3 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 4 Sep 2025 14:53:50 +0200 Subject: [PATCH 183/450] tunnels: reset the GSO metadata before reusing the skb If a GSO skb is sent through a Geneve tunnel and if Geneve options are added, the split GSO skb might not fit in the MTU anymore and an ICMP frag needed packet can be generated. In such case the ICMP packet might go through the segmentation logic (and dropped) later if it reaches a path were the GSO status is checked and segmentation is required. This is especially true when an OvS bridge is used with a Geneve tunnel attached to it. The following set of actions could lead to the ICMP packet being wrongfully segmented: 1. An skb is constructed by the TCP layer (e.g. gso_type SKB_GSO_TCPV4, segs >= 2). 2. The skb hits the OvS bridge where Geneve options are added by an OvS action before being sent through the tunnel. 3. When the skb is xmited in the tunnel, the split skb does not fit anymore in the MTU and iptunnel_pmtud_build_icmp is called to generate an ICMP fragmentation needed packet. This is done by reusing the original (GSO!) skb. The GSO metadata is not cleared. 4. The ICMP packet being sent back hits the OvS bridge again and because skb_is_gso returns true, it goes through queue_gso_packets... 5. ...where __skb_gso_segment is called. The skb is then dropped. 6. Note that in the above example on re-transmission the skb won't be a GSO one as it would be segmented (len > MSS) and the ICMP packet should go through. Fix this by resetting the GSO information before reusing an skb in iptunnel_pmtud_build_icmp and iptunnel_pmtud_build_icmpv6. Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Reported-by: Adrian Moreno Signed-off-by: Antoine Tenart Reviewed-by: Stefano Brivio Link: https://patch.msgid.link/20250904125351.159740-1-atenart@kernel.org Signed-off-by: Paolo Abeni --- net/ipv4/ip_tunnel_core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index cc9915543637..2e61ac137128 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -206,6 +206,9 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); @@ -300,6 +303,9 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); From 690aa09b1845c0d5c3c29dabd50a9d0488c97c48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Tue, 9 Sep 2025 11:28:29 +0200 Subject: [PATCH 184/450] ASoC: Intel: catpt: Expose correct bit depth to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently wrong bit depth is exposed in hw params, causing clipped volume during playback. Expose correct parameters. Fixes: a126750fc865 ("ASoC: Intel: catpt: PCM operations") Reported-by: Andy Shevchenko Tested-by: Andy Shevchenko Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Message-ID: <20250909092829.375953-1-amadeuszx.slawinski@linux.intel.com> Signed-off-by: Mark Brown --- sound/soc/intel/catpt/pcm.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/sound/soc/intel/catpt/pcm.c b/sound/soc/intel/catpt/pcm.c index 46acb7fdc547..bf734c69c4e0 100644 --- a/sound/soc/intel/catpt/pcm.c +++ b/sound/soc/intel/catpt/pcm.c @@ -568,8 +568,9 @@ static const struct snd_pcm_hardware catpt_pcm_hardware = { SNDRV_PCM_INFO_RESUME | SNDRV_PCM_INFO_NO_PERIOD_WAKEUP, .formats = SNDRV_PCM_FMTBIT_S16_LE | - SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, .period_bytes_min = PAGE_SIZE, .period_bytes_max = CATPT_BUFFER_MAX_SIZE / CATPT_PCM_PERIODS_MIN, .periods_min = CATPT_PCM_PERIODS_MIN, @@ -698,14 +699,18 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_48000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, .capture = { .stream_name = "Analog Capture", .channels_min = 2, .channels_max = 4, .rates = SNDRV_PCM_RATE_48000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { @@ -717,7 +722,9 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_8000_192000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { @@ -729,7 +736,9 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_8000_192000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { @@ -741,7 +750,9 @@ static struct snd_soc_dai_driver dai_drivers[] = { .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_48000, - .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE, + .formats = SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S32_LE, + .subformats = SNDRV_PCM_SUBFMTBIT_MSBITS_24 | + SNDRV_PCM_SUBFMTBIT_MSBITS_MAX, }, }, { From e895f8e29119c8c966ea794af9e9100b10becb88 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Tue, 5 Aug 2025 16:10:25 +0800 Subject: [PATCH 185/450] hrtimers: Unconditionally update target CPU base after offline timer migration When testing softirq based hrtimers on an ARM32 board, with high resolution mode and NOHZ inactive, softirq based hrtimers fail to expire after being moved away from an offline CPU: CPU0 CPU1 hrtimer_start(..., HRTIMER_MODE_SOFT); cpu_down(CPU1) ... hrtimers_cpu_dying() // Migrate timers to CPU0 smp_call_function_single(CPU0, returgger_next_event); retrigger_next_event() if (!highres && !nohz) return; As retrigger_next_event() is a NOOP when both high resolution timers and NOHZ are inactive CPU0's hrtimer_cpu_base::softirq_expires_next is not updated and the migrated softirq timers never expire unless there is a softirq based hrtimer queued on CPU0 later. Fix this by removing the hrtimer_hres_active() and tick_nohz_active() check in retrigger_next_event(), which enforces a full update of the CPU base. As this is not a fast path the extra cost does not matter. [ tglx: Massaged change log ] Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Co-developed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Xiongfeng Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250805081025.54235-1-wangxiongfeng2@huawei.com --- kernel/time/hrtimer.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c..e8c479329282 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -787,10 +787,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -2295,11 +2295,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. - */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); From 641427d5bf90af0625081bf27555418b101274cd Mon Sep 17 00:00:00 2001 From: Alex Tran Date: Wed, 3 Sep 2025 20:17:09 -0700 Subject: [PATCH 186/450] docs: networking: can: change bcm_msg_head frames member to support flexible array The documentation of the 'bcm_msg_head' struct does not match how it is defined in 'bcm.h'. Changed the frames member to a flexible array, matching the definition in the header file. See commit 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays with flexible-array members") Signed-off-by: Alex Tran Acked-by: Oliver Hartkopp Link: https://patch.msgid.link/20250904031709.1426895-1-alex.t.tran@gmail.com Fixes: 94dfc73e7cf4 ("treewide: uapi: Replace zero-length arrays with flexible-array members") Link: https://bugzilla.kernel.org/show_bug.cgi?id=217783 Signed-off-by: Marc Kleine-Budde --- Documentation/networking/can.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst index bc1b585355f7..7650c4b5be5f 100644 --- a/Documentation/networking/can.rst +++ b/Documentation/networking/can.rst @@ -742,7 +742,7 @@ The broadcast manager sends responses to user space in the same form: struct timeval ival1, ival2; /* count and subsequent interval */ canid_t can_id; /* unique can_id for task */ __u32 nframes; /* number of can_frames following */ - struct can_frame frames[0]; + struct can_frame frames[]; }; The aligned payload 'frames' uses the same basic CAN frame structure defined From 5f1af203ef964e7f7bf9d32716dfa5f332cc6f09 Mon Sep 17 00:00:00 2001 From: Mohammad Rafi Shaik Date: Mon, 8 Sep 2025 11:06:29 +0530 Subject: [PATCH 187/450] ASoC: qcom: audioreach: Fix lpaif_type configuration for the I2S interface Fix missing lpaif_type configuration for the I2S interface. The proper lpaif interface type required to allow DSP to vote appropriate clock setting for I2S interface. Fixes: 25ab80db6b133 ("ASoC: qdsp6: audioreach: add module configuration command helpers") Cc: stable@vger.kernel.org Reviewed-by: Srinivas Kandagatla Signed-off-by: Mohammad Rafi Shaik Message-ID: <20250908053631.70978-2-mohammad.rafi.shaik@oss.qualcomm.com> Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/audioreach.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/qcom/qdsp6/audioreach.c b/sound/soc/qcom/qdsp6/audioreach.c index 4ebaaf736fb9..3f5eed5afce5 100644 --- a/sound/soc/qcom/qdsp6/audioreach.c +++ b/sound/soc/qcom/qdsp6/audioreach.c @@ -971,6 +971,7 @@ static int audioreach_i2s_set_media_format(struct q6apm_graph *graph, param_data->param_id = PARAM_ID_I2S_INTF_CFG; param_data->param_size = ic_sz - APM_MODULE_PARAM_DATA_SIZE; + intf_cfg->cfg.lpaif_type = module->hw_interface_type; intf_cfg->cfg.intf_idx = module->hw_interface_idx; intf_cfg->cfg.sd_line_idx = module->sd_line_idx; From 33b55b94bca904ca25a9585e3cd43d15f0467969 Mon Sep 17 00:00:00 2001 From: Mohammad Rafi Shaik Date: Mon, 8 Sep 2025 11:06:30 +0530 Subject: [PATCH 188/450] ASoC: qcom: q6apm-lpass-dais: Fix missing set_fmt DAI op for I2S The q6i2s_set_fmt() function was defined but never linked into the I2S DAI operations, resulting DAI format settings is being ignored during stream setup. This change fixes the issue by properly linking the .set_fmt handler within the DAI ops. Fixes: 30ad723b93ade ("ASoC: qdsp6: audioreach: add q6apm lpass dai support") Cc: stable@vger.kernel.org Reviewed-by: Srinivas Kandagatla Signed-off-by: Mohammad Rafi Shaik Message-ID: <20250908053631.70978-3-mohammad.rafi.shaik@oss.qualcomm.com> Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/q6apm-lpass-dais.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c index a0d90462fd6a..36c034819b38 100644 --- a/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c +++ b/sound/soc/qcom/qdsp6/q6apm-lpass-dais.c @@ -260,6 +260,7 @@ static const struct snd_soc_dai_ops q6i2s_ops = { .shutdown = q6apm_lpass_dai_shutdown, .set_channel_map = q6dma_set_channel_map, .hw_params = q6dma_hw_params, + .set_fmt = q6i2s_set_fmt, }; static const struct snd_soc_dai_ops q6hdmi_ops = { From 596e8ba2faf0d2beb9bb68801622fa6461918c1d Mon Sep 17 00:00:00 2001 From: Mohammad Rafi Shaik Date: Mon, 8 Sep 2025 11:06:31 +0530 Subject: [PATCH 189/450] ASoC: qcom: sc8280xp: Enable DAI format configuration for MI2S interfaces Add support for configuring the DAI format on MI2S interfaces, this enhancement allows setting the appropriate bit clock and frame clock polarity, ensuring correct audio data transmission over MI2S. Reviewed-by: Srinivas Kandagatla Signed-off-by: Mohammad Rafi Shaik Rule: add Link: https://lore.kernel.org/stable/20250908053631.70978-4-mohammad.rafi.shaik%40oss.qualcomm.com Message-ID: <20250908053631.70978-4-mohammad.rafi.shaik@oss.qualcomm.com> Signed-off-by: Mark Brown --- sound/soc/qcom/sc8280xp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/soc/qcom/sc8280xp.c b/sound/soc/qcom/sc8280xp.c index 73f9f82c4e25..3067b95bcdbb 100644 --- a/sound/soc/qcom/sc8280xp.c +++ b/sound/soc/qcom/sc8280xp.c @@ -32,6 +32,10 @@ static int sc8280xp_snd_init(struct snd_soc_pcm_runtime *rtd) int dp_pcm_id = 0; switch (cpu_dai->id) { + case PRIMARY_MI2S_RX...QUATERNARY_MI2S_TX: + case QUINARY_MI2S_RX...QUINARY_MI2S_TX: + snd_soc_dai_set_fmt(cpu_dai, SND_SOC_DAIFMT_BP_FP); + break; case WSA_CODEC_DMA_RX_0: case WSA_CODEC_DMA_RX_1: /* From f5c32370dba668c171c73684f489a3ea0b9503c5 Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Thu, 4 Sep 2025 15:13:51 -0400 Subject: [PATCH 190/450] drm/amd/display: Disable DPCD Probe Quirk Disable dpcd probe quirk to native aux. Signed-off-by: Fangzhi Zuo Reviewed-by: Imre Deak Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4500 Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250904191351.746707-1-Jerry.Zuo@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit c5f4fb40584ee591da9fa090c6f265d11cbb1acf) Cc: stable@vger.kernel.org # 6.16.y: 5281cbe0b55a Cc: stable@vger.kernel.org # 6.16.y: 0b4aa85e8981 Cc: stable@vger.kernel.org # 6.16.y: b87ed522b364 Cc: stable@vger.kernel.org # 6.16.y --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 7187d5aedf0a..77a9d2c7d318 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -809,6 +809,7 @@ void amdgpu_dm_initialize_dp_connector(struct amdgpu_display_manager *dm, drm_dp_aux_init(&aconnector->dm_dp_aux.aux); drm_dp_cec_register_connector(&aconnector->dm_dp_aux.aux, &aconnector->base); + drm_dp_dpcd_set_probe(&aconnector->dm_dp_aux.aux, false); if (aconnector->base.connector_type == DRM_MODE_CONNECTOR_eDP) return; From 70f0b051f82d0234ade2f6753f72a2610048db3b Mon Sep 17 00:00:00 2001 From: Ovidiu Bunea Date: Mon, 25 Aug 2025 14:45:33 -0400 Subject: [PATCH 191/450] drm/amd/display: Correct sequences and delays for DCN35 PG & RCG [why] The current PG & RCG programming in driver has some gaps and incorrect sequences. [how] Added delays after ungating clocks to allow ramp up, increased polling to allow more time for power up, and removed the incorrect sequences. Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Charlene Liu Signed-off-by: Ovidiu Bunea Signed-off-by: Wayne Lin Tested-by: Dan Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 1bde5584e297921f45911ae874b0175dce5ed4b5) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dc.h | 1 + .../amd/display/dc/dccg/dcn35/dcn35_dccg.c | 74 +++++------ .../amd/display/dc/hwss/dcn35/dcn35_hwseq.c | 115 +++--------------- .../amd/display/dc/hwss/dcn35/dcn35_init.c | 3 - .../amd/display/dc/hwss/dcn351/dcn351_init.c | 3 - .../gpu/drm/amd/display/dc/inc/hw/pg_cntl.h | 1 + .../amd/display/dc/pg/dcn35/dcn35_pg_cntl.c | 78 +++++++----- 7 files changed, 111 insertions(+), 164 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 59c07756130d..f24e1da68269 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -1145,6 +1145,7 @@ struct dc_debug_options { bool enable_hblank_borrow; bool force_subvp_df_throttle; uint32_t acpi_transition_bitmasks[MAX_PIPES]; + bool enable_pg_cntl_debug_logs; }; diff --git a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c index 58c84f555c0f..0ce9489ac6b7 100644 --- a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c +++ b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c @@ -133,30 +133,34 @@ enum dsc_clk_source { }; -static void dccg35_set_dsc_clk_rcg(struct dccg *dccg, int inst, bool enable) +static void dccg35_set_dsc_clk_rcg(struct dccg *dccg, int inst, bool allow_rcg) { struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg); - if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dsc && enable) + if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dsc && allow_rcg) return; switch (inst) { case 0: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 1: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 2: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 3: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; default: BREAK_TO_DEBUGGER(); return; } + + /* Wait for clock to ramp */ + if (!allow_rcg) + udelay(10); } static void dccg35_set_symclk32_se_rcg( @@ -385,35 +389,34 @@ static void dccg35_set_dtbclk_p_rcg(struct dccg *dccg, int inst, bool enable) } } -static void dccg35_set_dppclk_rcg(struct dccg *dccg, - int inst, bool enable) +static void dccg35_set_dppclk_rcg(struct dccg *dccg, int inst, bool allow_rcg) { - struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg); - - if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && enable) + if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && allow_rcg) return; switch (inst) { case 0: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 1: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 2: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; case 3: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, enable ? 0 : 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, allow_rcg ? 0 : 1); break; default: BREAK_TO_DEBUGGER(); break; } - //DC_LOG_DEBUG("%s: inst(%d) DPPCLK rcg_disable: %d\n", __func__, inst, enable ? 0 : 1); + /* Wait for clock to ramp */ + if (!allow_rcg) + udelay(10); } static void dccg35_set_dpstreamclk_rcg( @@ -1177,32 +1180,34 @@ static void dccg35_update_dpp_dto(struct dccg *dccg, int dpp_inst, } static void dccg35_set_dppclk_root_clock_gating(struct dccg *dccg, - uint32_t dpp_inst, uint32_t enable) + uint32_t dpp_inst, uint32_t disallow_rcg) { struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg); - if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp) + if (!dccg->ctx->dc->debug.root_clock_optimization.bits.dpp && !disallow_rcg) return; switch (dpp_inst) { case 0: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK0_ROOT_GATE_DISABLE, disallow_rcg); break; case 1: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK1_ROOT_GATE_DISABLE, disallow_rcg); break; case 2: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK2_ROOT_GATE_DISABLE, disallow_rcg); break; case 3: - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, enable); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DPPCLK3_ROOT_GATE_DISABLE, disallow_rcg); break; default: break; } - //DC_LOG_DEBUG("%s: dpp_inst(%d) rcg: %d\n", __func__, dpp_inst, enable); + /* Wait for clock to ramp */ + if (disallow_rcg) + udelay(10); } static void dccg35_get_pixel_rate_div( @@ -1782,8 +1787,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) //Disable DTO switch (inst) { case 0: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK0_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK0_DTO_PARAM, DSCCLK0_DTO_PHASE, 0, @@ -1791,8 +1795,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK0_EN, 1); break; case 1: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK1_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK1_DTO_PARAM, DSCCLK1_DTO_PHASE, 0, @@ -1800,8 +1803,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK1_EN, 1); break; case 2: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK2_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK2_DTO_PARAM, DSCCLK2_DTO_PHASE, 0, @@ -1809,8 +1811,7 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) REG_UPDATE(DSCCLK_DTO_CTRL, DSCCLK2_EN, 1); break; case 3: - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dsc) - REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, 1); + REG_UPDATE(DCCG_GATE_DISABLE_CNTL6, DSCCLK3_ROOT_GATE_DISABLE, 1); REG_UPDATE_2(DSCCLK3_DTO_PARAM, DSCCLK3_DTO_PHASE, 0, @@ -1821,6 +1822,9 @@ static void dccg35_enable_dscclk(struct dccg *dccg, int inst) BREAK_TO_DEBUGGER(); return; } + + /* Wait for clock to ramp */ + udelay(10); } static void dccg35_disable_dscclk(struct dccg *dccg, @@ -1864,6 +1868,9 @@ static void dccg35_disable_dscclk(struct dccg *dccg, default: return; } + + /* Wait for clock ramp */ + udelay(10); } static void dccg35_enable_symclk_se(struct dccg *dccg, uint32_t stream_enc_inst, uint32_t link_enc_inst) @@ -2349,10 +2356,7 @@ static void dccg35_disable_symclk_se_cb( void dccg35_root_gate_disable_control(struct dccg *dccg, uint32_t pipe_idx, uint32_t disable_clock_gating) { - - if (dccg->ctx->dc->debug.root_clock_optimization.bits.dpp) { - dccg35_set_dppclk_root_clock_gating(dccg, pipe_idx, disable_clock_gating); - } + dccg35_set_dppclk_root_clock_gating(dccg, pipe_idx, disable_clock_gating); } static const struct dccg_funcs dccg35_funcs_new = { diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index a267f574b619..764eff6a4ec6 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -113,6 +113,14 @@ static void enable_memory_low_power(struct dc *dc) } #endif +static void print_pg_status(struct dc *dc, const char *debug_func, const char *debug_log) +{ + if (dc->debug.enable_pg_cntl_debug_logs && dc->res_pool->pg_cntl) { + if (dc->res_pool->pg_cntl->funcs->print_pg_status) + dc->res_pool->pg_cntl->funcs->print_pg_status(dc->res_pool->pg_cntl, debug_func, debug_log); + } +} + void dcn35_set_dmu_fgcg(struct dce_hwseq *hws, bool enable) { REG_UPDATE_3(DMU_CLK_CNTL, @@ -137,6 +145,8 @@ void dcn35_init_hw(struct dc *dc) uint32_t user_level = MAX_BACKLIGHT_LEVEL; int i; + print_pg_status(dc, __func__, ": start"); + if (dc->clk_mgr && dc->clk_mgr->funcs->init_clocks) dc->clk_mgr->funcs->init_clocks(dc->clk_mgr); @@ -200,10 +210,7 @@ void dcn35_init_hw(struct dc *dc) /* we want to turn off all dp displays before doing detection */ dc->link_srv->blank_all_dp_displays(dc); -/* - if (hws->funcs.enable_power_gating_plane) - hws->funcs.enable_power_gating_plane(dc->hwseq, true); -*/ + if (res_pool->hubbub && res_pool->hubbub->funcs->dchubbub_init) res_pool->hubbub->funcs->dchubbub_init(dc->res_pool->hubbub); /* If taking control over from VBIOS, we may want to optimize our first @@ -236,6 +243,8 @@ void dcn35_init_hw(struct dc *dc) } hws->funcs.init_pipes(dc, dc->current_state); + print_pg_status(dc, __func__, ": after init_pipes"); + if (dc->res_pool->hubbub->funcs->allow_self_refresh_control && !dc->res_pool->hubbub->ctx->dc->debug.disable_stutter) dc->res_pool->hubbub->funcs->allow_self_refresh_control(dc->res_pool->hubbub, @@ -312,6 +321,7 @@ void dcn35_init_hw(struct dc *dc) if (dc->res_pool->pg_cntl->funcs->init_pg_status) dc->res_pool->pg_cntl->funcs->init_pg_status(dc->res_pool->pg_cntl); } + print_pg_status(dc, __func__, ": after init_pg_status"); } static void update_dsc_on_stream(struct pipe_ctx *pipe_ctx, bool enable) @@ -500,97 +510,6 @@ void dcn35_physymclk_root_clock_control(struct dce_hwseq *hws, unsigned int phy_ } } -void dcn35_dsc_pg_control( - struct dce_hwseq *hws, - unsigned int dsc_inst, - bool power_on) -{ - uint32_t power_gate = power_on ? 0 : 1; - uint32_t pwr_status = power_on ? 0 : 2; - uint32_t org_ip_request_cntl = 0; - - if (hws->ctx->dc->debug.disable_dsc_power_gate) - return; - if (hws->ctx->dc->debug.ignore_pg) - return; - REG_GET(DC_IP_REQUEST_CNTL, IP_REQUEST_EN, &org_ip_request_cntl); - if (org_ip_request_cntl == 0) - REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 1); - - switch (dsc_inst) { - case 0: /* DSC0 */ - REG_UPDATE(DOMAIN16_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN16_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - case 1: /* DSC1 */ - REG_UPDATE(DOMAIN17_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN17_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - case 2: /* DSC2 */ - REG_UPDATE(DOMAIN18_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN18_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - case 3: /* DSC3 */ - REG_UPDATE(DOMAIN19_PG_CONFIG, - DOMAIN_POWER_GATE, power_gate); - - REG_WAIT(DOMAIN19_PG_STATUS, - DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); - break; - default: - BREAK_TO_DEBUGGER(); - break; - } - - if (org_ip_request_cntl == 0) - REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 0); -} - -void dcn35_enable_power_gating_plane(struct dce_hwseq *hws, bool enable) -{ - bool force_on = true; /* disable power gating */ - uint32_t org_ip_request_cntl = 0; - - if (hws->ctx->dc->debug.disable_hubp_power_gate) - return; - if (hws->ctx->dc->debug.ignore_pg) - return; - REG_GET(DC_IP_REQUEST_CNTL, IP_REQUEST_EN, &org_ip_request_cntl); - if (org_ip_request_cntl == 0) - REG_SET(DC_IP_REQUEST_CNTL, 0, IP_REQUEST_EN, 1); - /* DCHUBP0/1/2/3/4/5 */ - REG_UPDATE(DOMAIN0_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN2_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - /* DPP0/1/2/3/4/5 */ - REG_UPDATE(DOMAIN1_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN3_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - - force_on = true; /* disable power gating */ - if (enable && !hws->ctx->dc->debug.disable_dsc_power_gate) - force_on = false; - - /* DCS0/1/2/3/4 */ - REG_UPDATE(DOMAIN16_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN17_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN18_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - REG_UPDATE(DOMAIN19_PG_CONFIG, DOMAIN_POWER_FORCEON, force_on); - - -} - /* In headless boot cases, DIG may be turned * on which causes HW/SW discrepancies. * To avoid this, power down hardware on boot @@ -1453,6 +1372,8 @@ void dcn35_prepare_bandwidth( } dcn20_prepare_bandwidth(dc, context); + + print_pg_status(dc, __func__, ": after rcg and power up"); } void dcn35_optimize_bandwidth( @@ -1461,6 +1382,8 @@ void dcn35_optimize_bandwidth( { struct pg_block_update pg_update_state; + print_pg_status(dc, __func__, ": before rcg and power up"); + dcn20_optimize_bandwidth(dc, context); if (dc->hwss.calc_blocks_to_gate) { @@ -1472,6 +1395,8 @@ void dcn35_optimize_bandwidth( if (dc->hwss.root_clock_control) dc->hwss.root_clock_control(dc, &pg_update_state, false); } + + print_pg_status(dc, __func__, ": after rcg and power up"); } void dcn35_set_drr(struct pipe_ctx **pipe_ctx, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c index a3ccf805bd16..aefb7c473741 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c @@ -115,7 +115,6 @@ static const struct hw_sequencer_funcs dcn35_funcs = { .exit_optimized_pwr_state = dcn21_exit_optimized_pwr_state, .update_visual_confirm_color = dcn10_update_visual_confirm_color, .apply_idle_power_optimizations = dcn35_apply_idle_power_optimizations, - .update_dsc_pg = dcn32_update_dsc_pg, .calc_blocks_to_gate = dcn35_calc_blocks_to_gate, .calc_blocks_to_ungate = dcn35_calc_blocks_to_ungate, .hw_block_power_up = dcn35_hw_block_power_up, @@ -150,7 +149,6 @@ static const struct hwseq_private_funcs dcn35_private_funcs = { .plane_atomic_disable = dcn35_plane_atomic_disable, //.plane_atomic_disable = dcn20_plane_atomic_disable,/*todo*/ //.hubp_pg_control = dcn35_hubp_pg_control, - .enable_power_gating_plane = dcn35_enable_power_gating_plane, .dpp_root_clock_control = dcn35_dpp_root_clock_control, .dpstream_root_clock_control = dcn35_dpstream_root_clock_control, .physymclk_root_clock_control = dcn35_physymclk_root_clock_control, @@ -165,7 +163,6 @@ static const struct hwseq_private_funcs dcn35_private_funcs = { .calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values, .resync_fifo_dccg_dio = dcn314_resync_fifo_dccg_dio, .is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy, - .dsc_pg_control = dcn35_dsc_pg_control, .dsc_pg_status = dcn32_dsc_pg_status, .enable_plane = dcn35_enable_plane, .wait_for_pipe_update_if_needed = dcn10_wait_for_pipe_update_if_needed, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c index 58f2be2a326b..a580a55695c3 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn351/dcn351_init.c @@ -114,7 +114,6 @@ static const struct hw_sequencer_funcs dcn351_funcs = { .exit_optimized_pwr_state = dcn21_exit_optimized_pwr_state, .update_visual_confirm_color = dcn10_update_visual_confirm_color, .apply_idle_power_optimizations = dcn35_apply_idle_power_optimizations, - .update_dsc_pg = dcn32_update_dsc_pg, .calc_blocks_to_gate = dcn351_calc_blocks_to_gate, .calc_blocks_to_ungate = dcn351_calc_blocks_to_ungate, .hw_block_power_up = dcn351_hw_block_power_up, @@ -145,7 +144,6 @@ static const struct hwseq_private_funcs dcn351_private_funcs = { .plane_atomic_disable = dcn35_plane_atomic_disable, //.plane_atomic_disable = dcn20_plane_atomic_disable,/*todo*/ //.hubp_pg_control = dcn35_hubp_pg_control, - .enable_power_gating_plane = dcn35_enable_power_gating_plane, .dpp_root_clock_control = dcn35_dpp_root_clock_control, .dpstream_root_clock_control = dcn35_dpstream_root_clock_control, .physymclk_root_clock_control = dcn35_physymclk_root_clock_control, @@ -159,7 +157,6 @@ static const struct hwseq_private_funcs dcn351_private_funcs = { .setup_hpo_hw_control = dcn35_setup_hpo_hw_control, .calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values, .is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy, - .dsc_pg_control = dcn35_dsc_pg_control, .dsc_pg_status = dcn32_dsc_pg_status, .enable_plane = dcn35_enable_plane, .wait_for_pipe_update_if_needed = dcn10_wait_for_pipe_update_if_needed, diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h b/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h index 44f86cc2d1d6..227e3f8d7e5f 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/pg_cntl.h @@ -49,6 +49,7 @@ struct pg_cntl_funcs { void (*mem_pg_control)(struct pg_cntl *pg_cntl, bool power_on); void (*dio_pg_control)(struct pg_cntl *pg_cntl, bool power_on); void (*init_pg_status)(struct pg_cntl *pg_cntl); + void (*print_pg_status)(struct pg_cntl *pg_cntl, const char *debug_func, const char *debug_log); }; #endif //__DC_PG_CNTL_H__ diff --git a/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c b/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c index af21c0a27f86..72bd43f9bbe2 100644 --- a/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c +++ b/drivers/gpu/drm/amd/display/dc/pg/dcn35/dcn35_pg_cntl.c @@ -79,16 +79,12 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo uint32_t power_gate = power_on ? 0 : 1; uint32_t pwr_status = power_on ? 0 : 2; uint32_t org_ip_request_cntl = 0; - bool block_enabled; + bool block_enabled = false; + bool skip_pg = pg_cntl->ctx->dc->debug.ignore_pg || + pg_cntl->ctx->dc->debug.disable_dsc_power_gate || + pg_cntl->ctx->dc->idle_optimizations_allowed; - /*need to enable dscclk regardless DSC_PG*/ - if (pg_cntl->ctx->dc->res_pool->dccg->funcs->enable_dsc && power_on) - pg_cntl->ctx->dc->res_pool->dccg->funcs->enable_dsc( - pg_cntl->ctx->dc->res_pool->dccg, dsc_inst); - - if (pg_cntl->ctx->dc->debug.ignore_pg || - pg_cntl->ctx->dc->debug.disable_dsc_power_gate || - pg_cntl->ctx->dc->idle_optimizations_allowed) + if (skip_pg && !power_on) return; block_enabled = pg_cntl35_dsc_pg_status(pg_cntl, dsc_inst); @@ -111,7 +107,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN16_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; case 1: /* DSC1 */ REG_UPDATE(DOMAIN17_PG_CONFIG, @@ -119,7 +115,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN17_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; case 2: /* DSC2 */ REG_UPDATE(DOMAIN18_PG_CONFIG, @@ -127,7 +123,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN18_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; case 3: /* DSC3 */ REG_UPDATE(DOMAIN19_PG_CONFIG, @@ -135,7 +131,7 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo REG_WAIT(DOMAIN19_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, - 1, 1000); + 1, 10000); break; default: BREAK_TO_DEBUGGER(); @@ -144,12 +140,6 @@ void pg_cntl35_dsc_pg_control(struct pg_cntl *pg_cntl, unsigned int dsc_inst, bo if (dsc_inst < MAX_PIPES) pg_cntl->pg_pipe_res_enable[PG_DSC][dsc_inst] = power_on; - - if (pg_cntl->ctx->dc->res_pool->dccg->funcs->disable_dsc && !power_on) { - /*this is to disable dscclk*/ - pg_cntl->ctx->dc->res_pool->dccg->funcs->disable_dsc( - pg_cntl->ctx->dc->res_pool->dccg, dsc_inst); - } } static bool pg_cntl35_hubp_dpp_pg_status(struct pg_cntl *pg_cntl, unsigned int hubp_dpp_inst) @@ -189,11 +179,12 @@ void pg_cntl35_hubp_dpp_pg_control(struct pg_cntl *pg_cntl, unsigned int hubp_dp uint32_t pwr_status = power_on ? 0 : 2; uint32_t org_ip_request_cntl; bool block_enabled; + bool skip_pg = pg_cntl->ctx->dc->debug.ignore_pg || + pg_cntl->ctx->dc->debug.disable_hubp_power_gate || + pg_cntl->ctx->dc->debug.disable_dpp_power_gate || + pg_cntl->ctx->dc->idle_optimizations_allowed; - if (pg_cntl->ctx->dc->debug.ignore_pg || - pg_cntl->ctx->dc->debug.disable_hubp_power_gate || - pg_cntl->ctx->dc->debug.disable_dpp_power_gate || - pg_cntl->ctx->dc->idle_optimizations_allowed) + if (skip_pg && !power_on) return; block_enabled = pg_cntl35_hubp_dpp_pg_status(pg_cntl, hubp_dpp_inst); @@ -213,22 +204,22 @@ void pg_cntl35_hubp_dpp_pg_control(struct pg_cntl *pg_cntl, unsigned int hubp_dp case 0: /* DPP0 & HUBP0 */ REG_UPDATE(DOMAIN0_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN0_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN0_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; case 1: /* DPP1 & HUBP1 */ REG_UPDATE(DOMAIN1_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN1_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN1_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; case 2: /* DPP2 & HUBP2 */ REG_UPDATE(DOMAIN2_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN2_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN2_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; case 3: /* DPP3 & HUBP3 */ REG_UPDATE(DOMAIN3_PG_CONFIG, DOMAIN_POWER_GATE, power_gate); - REG_WAIT(DOMAIN3_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 1000); + REG_WAIT(DOMAIN3_PG_STATUS, DOMAIN_PGFSM_PWR_STATUS, pwr_status, 1, 10000); break; default: BREAK_TO_DEBUGGER(); @@ -501,6 +492,36 @@ void pg_cntl35_init_pg_status(struct pg_cntl *pg_cntl) pg_cntl->pg_res_enable[PG_DWB] = block_enabled; } +static void pg_cntl35_print_pg_status(struct pg_cntl *pg_cntl, const char *debug_func, const char *debug_log) +{ + int i = 0; + bool block_enabled = false; + + DC_LOG_DEBUG("%s: %s", debug_func, debug_log); + + DC_LOG_DEBUG("PG_CNTL status:\n"); + + block_enabled = pg_cntl35_io_clk_status(pg_cntl); + DC_LOG_DEBUG("ONO0=%d (DCCG, DIO, DCIO)\n", block_enabled ? 1 : 0); + + block_enabled = pg_cntl35_mem_status(pg_cntl); + DC_LOG_DEBUG("ONO1=%d (DCHUBBUB, DCHVM, DCHUBBUBMEM)\n", block_enabled ? 1 : 0); + + block_enabled = pg_cntl35_plane_otg_status(pg_cntl); + DC_LOG_DEBUG("ONO2=%d (MPC, OPP, OPTC, DWB)\n", block_enabled ? 1 : 0); + + block_enabled = pg_cntl35_hpo_pg_status(pg_cntl); + DC_LOG_DEBUG("ONO3=%d (HPO)\n", block_enabled ? 1 : 0); + + for (i = 0; i < pg_cntl->ctx->dc->res_pool->pipe_count; i++) { + block_enabled = pg_cntl35_hubp_dpp_pg_status(pg_cntl, i); + DC_LOG_DEBUG("ONO%d=%d (DCHUBP%d, DPP%d)\n", 4 + i * 2, block_enabled ? 1 : 0, i, i); + + block_enabled = pg_cntl35_dsc_pg_status(pg_cntl, i); + DC_LOG_DEBUG("ONO%d=%d (DSC%d)\n", 5 + i * 2, block_enabled ? 1 : 0, i); + } +} + static const struct pg_cntl_funcs pg_cntl35_funcs = { .init_pg_status = pg_cntl35_init_pg_status, .dsc_pg_control = pg_cntl35_dsc_pg_control, @@ -511,7 +532,8 @@ static const struct pg_cntl_funcs pg_cntl35_funcs = { .mpcc_pg_control = pg_cntl35_mpcc_pg_control, .opp_pg_control = pg_cntl35_opp_pg_control, .optc_pg_control = pg_cntl35_optc_pg_control, - .dwb_pg_control = pg_cntl35_dwb_pg_control + .dwb_pg_control = pg_cntl35_dwb_pg_control, + .print_pg_status = pg_cntl35_print_pg_status }; struct pg_cntl *pg_cntl35_create( From 60f71f0db7b12f303789ef59949e38ee5838ee8b Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Fri, 5 Sep 2025 10:36:27 -0500 Subject: [PATCH 192/450] drm/amd/display: Drop dm_prepare_suspend() and dm_complete() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] dm_prepare_suspend() was added in commit 50e0bae34fa6b ("drm/amd/display: Add and use new dm_prepare_suspend() callback") to allow display to turn off earlier in the suspend sequence. This caused a regression that HDMI audio sometimes didn't work properly after resume unless audio was playing during suspend. [How] Drop dm_prepare_suspend() callback. All code in it will still run during dm_suspend(). Also drop unnecessary dm_complete() callback. dm_complete() was used for failed prepare and also for any case of successful resume. The code in it already runs in dm_resume(). This change will introduce more time that the display is turned on during suspend sequence. The compositor can turn it off sooner if desired. Cc: Harry Wentland Reported-by: Przemysław Kopa Closes: https://lore.kernel.org/amd-gfx/1cea0d56-7739-4ad9-bf8e-c9330faea2bb@kernel.org/T/#m383d9c08397043a271b36c32b64bb80e524e4b0f Reported-by: Kalvin Closes: https://github.com/alsa-project/alsa-lib/issues/465 Closes: https://gitlab.freedesktop.org/pipewire/pipewire/-/issues/4809 Fixes: 50e0bae34fa6b ("drm/amd/display: Add and use new dm_prepare_suspend() callback") Signed-off-by: Mario Limonciello Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 2fd653b9bb5aacec5d4c421ab290905898fe85a2) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7808a647a306..352b3dcd0e0e 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3127,25 +3127,6 @@ static void dm_destroy_cached_state(struct amdgpu_device *adev) dm->cached_state = NULL; } -static void dm_complete(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - dm_destroy_cached_state(adev); -} - -static int dm_prepare_suspend(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - if (amdgpu_in_reset(adev)) - return 0; - - WARN_ON(adev->dm.cached_state); - - return dm_cache_state(adev); -} - static int dm_suspend(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; @@ -3571,10 +3552,8 @@ static const struct amd_ip_funcs amdgpu_dm_funcs = { .early_fini = amdgpu_dm_early_fini, .hw_init = dm_hw_init, .hw_fini = dm_hw_fini, - .prepare_suspend = dm_prepare_suspend, .suspend = dm_suspend, .resume = dm_resume, - .complete = dm_complete, .is_idle = dm_is_idle, .wait_for_idle = dm_wait_for_idle, .check_soft_reset = dm_check_soft_reset, From 1dfd2864a1c4909147663e5a27c055f50f7c2796 Mon Sep 17 00:00:00 2001 From: Geoffrey McRae Date: Thu, 28 Aug 2025 22:26:22 +1000 Subject: [PATCH 193/450] drm/amd/display: remove oem i2c adapter on finish Fixes a bug where unbinding of the GPU would leave the oem i2c adapter registered resulting in a null pointer dereference when applications try to access the invalid device. Fixes: 3d5470c97314 ("drm/amd/display/dm: add support for OEM i2c bus") Cc: Harry Wentland Reviewed-by: Alex Deucher Signed-off-by: Geoffrey McRae Signed-off-by: Alex Deucher (cherry picked from commit 89923fb7ead4fdd37b78dd49962d9bb5892403e6) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 352b3dcd0e0e..4e86370ae705 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2913,6 +2913,17 @@ static int dm_oem_i2c_hw_init(struct amdgpu_device *adev) return 0; } +static void dm_oem_i2c_hw_fini(struct amdgpu_device *adev) +{ + struct amdgpu_display_manager *dm = &adev->dm; + + if (dm->oem_i2c) { + i2c_del_adapter(&dm->oem_i2c->base); + kfree(dm->oem_i2c); + dm->oem_i2c = NULL; + } +} + /** * dm_hw_init() - Initialize DC device * @ip_block: Pointer to the amdgpu_ip_block for this hw instance. @@ -2963,7 +2974,7 @@ static int dm_hw_fini(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; - kfree(adev->dm.oem_i2c); + dm_oem_i2c_hw_fini(adev); amdgpu_dm_hpd_fini(adev); From ce42a3b581a9db10765eb835840b04dbe7972135 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Mon, 25 Aug 2025 09:50:49 -0400 Subject: [PATCH 194/450] drm/amdkfd: fix p2p links bug in topology When creating p2p links, KFD needs to check XGMI link with two conditions, hive_id and is_sharing_enabled, but it is missing to check is_sharing_enabled, so add it to fix the error. Signed-off-by: Eric Huang Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 36cc7d13178d901982da7a122c883861d98da624) --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 4ec73f33535e..720b20e842ba 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1587,7 +1587,8 @@ static int kfd_dev_create_p2p_links(void) break; if (!dev->gpu || !dev->gpu->adev || (dev->gpu->kfd->hive_id && - dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id)) + dev->gpu->kfd->hive_id == new_dev->gpu->kfd->hive_id && + amdgpu_xgmi_get_is_sharing_enabled(dev->gpu->adev, new_dev->gpu->adev))) goto next; /* check if node(s) is/are peer accessible in one direction or bi-direction */ From 53503556273a5ead8b75534085e2dcb46e96f883 Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Wed, 20 Aug 2025 16:10:51 +0800 Subject: [PATCH 195/450] amd/amdkfd: correct mem limit calculation for small APUs Current mem limit check leaks some GTT memory (reserved_for_pt reserved_for_ras + adev->vram_pin_size) for small APUs. Since carveout VRAM is tunable on APUs, there are three case regarding the carveout VRAM size relative to GTT: 1. 0 < carveout < gtt apu_prefer_gtt = true, is_app_apu = false 2. carveout > gtt / 2 apu_prefer_gtt = false, is_app_apu = false 3. 0 = carveout apu_prefer_gtt = true, is_app_apu = true It doesn't make sense to check below limitation in case 1 (default case, small carveout) because the values in the below expression are mixed with carveout and gtt. adev->kfd.vram_used[xcp_id] + vram_needed > vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(&adev->vram_pin_size) gtt: kfd.vram_used, vram_needed, vram_size carveout: reserved_for_pt, reserved_for_ras, adev->vram_pin_size In case 1, vram allocation will go to gtt domain, skip vram check since ttm_mem_limit check already cover this allocation. Signed-off-by: Yifan Zhang Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit fa7c99f04f6dd299388e9282812b14e95558ac8e) --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 260165bbe373..b16cce7c22c3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -213,19 +213,35 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, spin_lock(&kfd_mem_limit.mem_limit_lock); if (kfd_mem_limit.system_mem_used + system_mem_needed > - kfd_mem_limit.max_system_mem_limit) + kfd_mem_limit.max_system_mem_limit) { pr_debug("Set no_system_mem_limit=1 if using shared memory\n"); + if (!no_system_mem_limit) { + ret = -ENOMEM; + goto release; + } + } - if ((kfd_mem_limit.system_mem_used + system_mem_needed > - kfd_mem_limit.max_system_mem_limit && !no_system_mem_limit) || - (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > - kfd_mem_limit.max_ttm_mem_limit) || - (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed > - vram_size - reserved_for_pt - reserved_for_ras - atomic64_read(&adev->vram_pin_size))) { + if (kfd_mem_limit.ttm_mem_used + ttm_mem_needed > + kfd_mem_limit.max_ttm_mem_limit) { ret = -ENOMEM; goto release; } + /*if is_app_apu is false and apu_prefer_gtt is true, it is an APU with + * carve out < gtt. In that case, VRAM allocation will go to gtt domain, skip + * VRAM check since ttm_mem_limit check already cover this allocation + */ + + if (adev && xcp_id >= 0 && (!adev->apu_prefer_gtt || adev->gmc.is_app_apu)) { + uint64_t vram_available = + vram_size - reserved_for_pt - reserved_for_ras - + atomic64_read(&adev->vram_pin_size); + if (adev->kfd.vram_used[xcp_id] + vram_needed > vram_available) { + ret = -ENOMEM; + goto release; + } + } + /* Update memory accounting by decreasing available system * memory, TTM memory and GPU memory as computed above */ @@ -1626,11 +1642,15 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev, uint64_t vram_available, system_mem_available, ttm_mem_available; spin_lock(&kfd_mem_limit.mem_limit_lock); - vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id) - - adev->kfd.vram_used_aligned[xcp_id] - - atomic64_read(&adev->vram_pin_size) - - reserved_for_pt - - reserved_for_ras; + if (adev->apu_prefer_gtt && !adev->gmc.is_app_apu) + vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id) + - adev->kfd.vram_used_aligned[xcp_id]; + else + vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id) + - adev->kfd.vram_used_aligned[xcp_id] + - atomic64_read(&adev->vram_pin_size) + - reserved_for_pt + - reserved_for_ras; if (adev->apu_prefer_gtt) { system_mem_available = no_system_mem_limit ? From 75871a525a596ff4d16c4aebc0018f8d0923c9b1 Mon Sep 17 00:00:00 2001 From: Tianyu Xu Date: Tue, 12 Aug 2025 21:10:56 +0800 Subject: [PATCH 196/450] igb: Fix NULL pointer dereference in ethtool loopback test The igb driver currently causes a NULL pointer dereference when executing the ethtool loopback test. This occurs because there is no associated q_vector for the test ring when it is set up, as interrupts are typically not added to the test rings. Since commit 5ef44b3cb43b removed the napi_id assignment in __xdp_rxq_info_reg(), there is no longer a need to pass a napi_id to it. Therefore, simply use 0 as the last parameter. Fixes: 2c6196013f84 ("igb: Add AF_XDP zero-copy Rx support") Reviewed-by: Aleksandr Loktionov Reviewed-by: Joe Damato Signed-off-by: Tianyu Xu Reviewed-by: Paul Menzel Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index a9a7a94ae61e..453deb6d14b3 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -4453,8 +4453,7 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring) if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) xdp_rxq_info_unreg(&rx_ring->xdp_rxq); res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, - rx_ring->queue_index, - rx_ring->q_vector->napi.napi_id); + rx_ring->queue_index, 0); if (res < 0) { dev_err(dev, "Failed to register xdp_rxq index %u\n", rx_ring->queue_index); From d709f178abca22a4d3642513df29afe4323a594b Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Fri, 15 Aug 2025 15:26:31 +0900 Subject: [PATCH 197/450] igb: fix link test skipping when interface is admin down The igb driver incorrectly skips the link test when the network interface is admin down (if_running == false), causing the test to always report PASS regardless of the actual physical link state. This behavior is inconsistent with other drivers (e.g. i40e, ice, ixgbe, etc.) which correctly test the physical link state regardless of admin state. Remove the if_running check to ensure link test always reflects the physical link state. Fixes: 8d420a1b3ea6 ("igb: correct link test not being run when link is down") Signed-off-by: Kohei Enju Reviewed-by: Paul Menzel Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_ethtool.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 92ef33459aec..7b8f32c5169a 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -2081,11 +2081,8 @@ static void igb_diag_test(struct net_device *netdev, } else { dev_info(&adapter->pdev->dev, "online testing starting\n"); - /* PHY is powered down when interface is down */ - if (if_running && igb_link_test(adapter, &data[TEST_LINK])) + if (igb_link_test(adapter, &data[TEST_LINK])) eth_test->flags |= ETH_TEST_FL_FAILED; - else - data[TEST_LINK] = 0; /* Online tests aren't run; pass by default */ data[TEST_REG] = 0; From 915470e1b44e71d1dd07ee067276f003c3521ee3 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Mon, 18 Aug 2025 17:39:03 +0200 Subject: [PATCH 198/450] i40e: fix IRQ freeing in i40e_vsi_request_irq_msix error path If request_irq() in i40e_vsi_request_irq_msix() fails in an iteration later than the first, the error path wants to free the IRQs requested so far. However, it uses the wrong dev_id argument for free_irq(), so it does not free the IRQs correctly and instead triggers the warning: Trying to free already-free IRQ 173 WARNING: CPU: 25 PID: 1091 at kernel/irq/manage.c:1829 __free_irq+0x192/0x2c0 Modules linked in: i40e(+) [...] CPU: 25 UID: 0 PID: 1091 Comm: NetworkManager Not tainted 6.17.0-rc1+ #1 PREEMPT(lazy) Hardware name: [...] RIP: 0010:__free_irq+0x192/0x2c0 [...] Call Trace: free_irq+0x32/0x70 i40e_vsi_request_irq_msix.cold+0x63/0x8b [i40e] i40e_vsi_request_irq+0x79/0x80 [i40e] i40e_vsi_open+0x21f/0x2f0 [i40e] i40e_open+0x63/0x130 [i40e] __dev_open+0xfc/0x210 __dev_change_flags+0x1fc/0x240 netif_change_flags+0x27/0x70 do_setlink.isra.0+0x341/0xc70 rtnl_newlink+0x468/0x860 rtnetlink_rcv_msg+0x375/0x450 netlink_rcv_skb+0x5c/0x110 netlink_unicast+0x288/0x3c0 netlink_sendmsg+0x20d/0x430 ____sys_sendmsg+0x3a2/0x3d0 ___sys_sendmsg+0x99/0xe0 __sys_sendmsg+0x8a/0xf0 do_syscall_64+0x82/0x2c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e [...] ---[ end trace 0000000000000000 ]--- Use the same dev_id for free_irq() as for request_irq(). I tested this with inserting code to fail intentionally. Fixes: 493fb30011b3 ("i40e: Move q_vectors from pointer to array to array of pointers") Signed-off-by: Michal Schmidt Reviewed-by: Aleksandr Loktionov Reviewed-by: Subbaraya Sundeep Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index b83f823e4917..dd21d93d39dd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -4156,7 +4156,7 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename) irq_num = pf->msix_entries[base + vector].vector; irq_set_affinity_notifier(irq_num, NULL); irq_update_affinity_hint(irq_num, NULL); - free_irq(irq_num, &vsi->q_vectors[vector]); + free_irq(irq_num, vsi->q_vectors[vector]); } return err; } From 7934fdc25ad642ab3dbc16d734ab58638520ea60 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 4 Sep 2025 12:35:21 +0200 Subject: [PATCH 199/450] drm/xe/configfs: Don't touch survivability_mode on fini This is a user controlled configfs attribute, we should not modify that outside the configfs attr.store() implementation. Fixes: bc417e54e24b ("drm/xe: Enable configfs support for survivability mode") Signed-off-by: Michal Wajdeczko Cc: Lucas De Marchi Cc: Riana Tauro Reviewed-by: Stuart Summers Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250904103521.7130-1-michal.wajdeczko@intel.com (cherry picked from commit 079a5c83dbd23db7a6eed8f558cf75e264d8a17b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_survivability_mode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 41705f5d52e3..8f7b0add2364 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -41,6 +41,8 @@ * * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode * + * It is the responsibility of the user to clear the mode once firmware flash is complete. + * * Refer :ref:`xe_configfs` for more details on how to use configfs * * Survivability mode is indicated by the below admin-only readable sysfs which provides additional @@ -147,7 +149,6 @@ static void xe_survivability_mode_fini(void *arg) struct pci_dev *pdev = to_pci_dev(xe->drm.dev); struct device *dev = &pdev->dev; - xe_configfs_clear_survivability_mode(pdev); sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); } From 5c87fee3c96ce898ad681552404a66c7605193c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Thu, 4 Sep 2025 18:07:13 +0200 Subject: [PATCH 200/450] drm/xe: Attempt to bring bos back to VRAM after eviction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VRAM+TT bos that are evicted from VRAM to TT may remain in TT also after a revalidation following eviction or suspend. This manifests itself as applications becoming sluggish after buffer objects get evicted or after a resume from suspend or hibernation. If the bo supports placement in both VRAM and TT, and we are on DGFX, mark the TT placement as fallback. This means that it is tried only after VRAM + eviction. This flaw has probably been present since the xe module was upstreamed but use a Fixes: commit below where backporting is likely to be simple. For earlier versions we need to open- code the fallback algorithm in the driver. v2: - Remove check for dgfx. (Matthew Auld) - Update the xe_dma_buf kunit test for the new strategy (CI) - Allow dma-buf to pin in current placement (CI) - Make xe_bo_validate() for pinned bos a NOP. Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/5995 Fixes: a78a8da51b36 ("drm/ttm: replace busy placement with flags v6") Cc: Matthew Brost Cc: Matthew Auld Cc: # v6.9+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Link: https://lore.kernel.org/r/20250904160715.2613-2-thomas.hellstrom@linux.intel.com (cherry picked from commit cb3d7b3b46b799c96b54f8e8fe36794a55a77f0b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/tests/xe_bo.c | 2 +- drivers/gpu/drm/xe/tests/xe_dma_buf.c | 10 +--------- drivers/gpu/drm/xe/xe_bo.c | 16 ++++++++++++---- drivers/gpu/drm/xe/xe_bo.h | 2 +- drivers/gpu/drm/xe/xe_dma_buf.c | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c b/drivers/gpu/drm/xe/tests/xe_bo.c index bb469096d072..7b40cc8be1c9 100644 --- a/drivers/gpu/drm/xe/tests/xe_bo.c +++ b/drivers/gpu/drm/xe/tests/xe_bo.c @@ -236,7 +236,7 @@ static int evict_test_run_tile(struct xe_device *xe, struct xe_tile *tile, struc } xe_bo_lock(external, false); - err = xe_bo_pin_external(external); + err = xe_bo_pin_external(external, false); xe_bo_unlock(external); if (err) { KUNIT_FAIL(test, "external bo pin err=%pe\n", diff --git a/drivers/gpu/drm/xe/tests/xe_dma_buf.c b/drivers/gpu/drm/xe/tests/xe_dma_buf.c index c53f67ce4b0a..121f17c112ec 100644 --- a/drivers/gpu/drm/xe/tests/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/tests/xe_dma_buf.c @@ -89,15 +89,7 @@ static void check_residency(struct kunit *test, struct xe_bo *exported, return; } - /* - * If on different devices, the exporter is kept in system if - * possible, saving a migration step as the transfer is just - * likely as fast from system memory. - */ - if (params->mem_mask & XE_BO_FLAG_SYSTEM) - KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, XE_PL_TT)); - else - KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type)); + KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type)); if (params->force_different_devices) KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(imported, XE_PL_TT)); diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 9954bb458ce1..bae7ff2e5927 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -186,6 +186,8 @@ static void try_add_system(struct xe_device *xe, struct xe_bo *bo, bo->placements[*c] = (struct ttm_place) { .mem_type = XE_PL_TT, + .flags = (bo_flags & XE_BO_FLAG_VRAM_MASK) ? + TTM_PL_FLAG_FALLBACK : 0, }; *c += 1; } @@ -2269,6 +2271,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res) /** * xe_bo_pin_external - pin an external BO * @bo: buffer object to be pinned + * @in_place: Pin in current placement, don't attempt to migrate. * * Pin an external (not tied to a VM, can be exported via dma-buf / prime FD) * BO. Unique call compared to xe_bo_pin as this function has it own set of @@ -2276,7 +2279,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res) * * Returns 0 for success, negative error code otherwise. */ -int xe_bo_pin_external(struct xe_bo *bo) +int xe_bo_pin_external(struct xe_bo *bo, bool in_place) { struct xe_device *xe = xe_bo_device(bo); int err; @@ -2285,9 +2288,11 @@ int xe_bo_pin_external(struct xe_bo *bo) xe_assert(xe, xe_bo_is_user(bo)); if (!xe_bo_is_pinned(bo)) { - err = xe_bo_validate(bo, NULL, false); - if (err) - return err; + if (!in_place) { + err = xe_bo_validate(bo, NULL, false); + if (err) + return err; + } spin_lock(&xe->pinned.lock); list_add_tail(&bo->pinned_link, &xe->pinned.late.external); @@ -2440,6 +2445,9 @@ int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict) }; int ret; + if (xe_bo_is_pinned(bo)) + return 0; + if (vm) { lockdep_assert_held(&vm->lock); xe_vm_assert_held(vm); diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h index 02e8cde4c6b2..9ce94d252015 100644 --- a/drivers/gpu/drm/xe/xe_bo.h +++ b/drivers/gpu/drm/xe/xe_bo.h @@ -198,7 +198,7 @@ static inline void xe_bo_unlock_vm_held(struct xe_bo *bo) } } -int xe_bo_pin_external(struct xe_bo *bo); +int xe_bo_pin_external(struct xe_bo *bo, bool in_place); int xe_bo_pin(struct xe_bo *bo); void xe_bo_unpin_external(struct xe_bo *bo); void xe_bo_unpin(struct xe_bo *bo); diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index 346f857f3837..af64baf872ef 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -72,7 +72,7 @@ static int xe_dma_buf_pin(struct dma_buf_attachment *attach) return ret; } - ret = xe_bo_pin_external(bo); + ret = xe_bo_pin_external(bo, true); xe_assert(xe, !ret); return 0; From d84820309ed34cc412ce76ecfa9471dae7d7d144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Thu, 4 Sep 2025 18:07:14 +0200 Subject: [PATCH 201/450] drm/xe: Allow the pm notifier to continue on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its actions are opportunistic anyway and will be completed on device suspend. Marking as a fix to simplify backporting of the fix that follows in the series. v2: - Keep the runtime pm reference over suspend / hibernate and document why. (Matt Auld, Rodrigo Vivi): Fixes: c6a4d46ec1d7 ("drm/xe: evict user memory in PM notifier") Cc: Matthew Auld Cc: Rodrigo Vivi Cc: # v6.16+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Link: https://lore.kernel.org/r/20250904160715.2613-3-thomas.hellstrom@linux.intel.com (cherry picked from commit ebd546fdffddfcaeab08afdd68ec93052c8fa740) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_pm.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index e279b47ba03b..c9d5f0a21c48 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -301,17 +301,17 @@ static int xe_pm_notifier_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: xe_pm_runtime_get(xe); err = xe_bo_evict_all_user(xe); - if (err) { + if (err) drm_dbg(&xe->drm, "Notifier evict user failed (%d)\n", err); - xe_pm_runtime_put(xe); - break; - } err = xe_bo_notifier_prepare_all_pinned(xe); - if (err) { + if (err) drm_dbg(&xe->drm, "Notifier prepare pin failed (%d)\n", err); - xe_pm_runtime_put(xe); - } + /* + * Keep the runtime pm reference until post hibernation / post suspend to + * avoid a runtime suspend interfering with evicted objects or backup + * allocations. + */ break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: @@ -320,9 +320,6 @@ static int xe_pm_notifier_callback(struct notifier_block *nb, break; } - if (err) - return NOTIFY_BAD; - return NOTIFY_DONE; } From eb5723a75104605b7d2207a7d598e314166fbef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Thu, 4 Sep 2025 18:07:15 +0200 Subject: [PATCH 202/450] drm/xe: Block exec and rebind worker while evicting for suspend / hibernate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the xe pm_notifier evicts for suspend / hibernate, there might be racing tasks trying to re-validate again. This can lead to suspend taking excessive time or get stuck in a live-lock. This behaviour becomes much worse with the fix that actually makes re-validation bring back bos to VRAM rather than letting them remain in TT. Prevent that by having exec and the rebind worker waiting for a completion that is set to block by the pm_notifier before suspend and is signaled by the pm_notifier after resume / wakeup. It's probably still possible to craft malicious applications that block suspending. More work is pending to fix that. v3: - Avoid wait_for_completion() in the kernel worker since it could potentially cause work item flushes from freezable processes to wait forever. Instead terminate the rebind workers if needed and re-launch at resume. (Matt Auld) v4: - Fix some bad naming and leftover debug printouts. - Fix kerneldoc. - Use drmm_mutex_init() for the xe->rebind_resume_lock (Matt Auld). - Rework the interface of xe_vm_rebind_resume_worker (Matt Auld). Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4288 Fixes: c6a4d46ec1d7 ("drm/xe: evict user memory in PM notifier") Cc: Matthew Auld Cc: Rodrigo Vivi Cc: # v6.16+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Link: https://lore.kernel.org/r/20250904160715.2613-4-thomas.hellstrom@linux.intel.com (cherry picked from commit 599334572a5a99111015fbbd5152ce4dedc2f8b7) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device_types.h | 6 ++++ drivers/gpu/drm/xe/xe_exec.c | 9 ++++++ drivers/gpu/drm/xe/xe_pm.c | 25 +++++++++++++++++ drivers/gpu/drm/xe/xe_vm.c | 42 +++++++++++++++++++++++++++- drivers/gpu/drm/xe/xe_vm.h | 2 ++ drivers/gpu/drm/xe/xe_vm_types.h | 5 ++++ 6 files changed, 88 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index d4d2c6854790..7ceb0c90f391 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -553,6 +553,12 @@ struct xe_device { /** @pm_notifier: Our PM notifier to perform actions in response to various PM events. */ struct notifier_block pm_notifier; + /** @pm_block: Completion to block validating tasks on suspend / hibernate prepare */ + struct completion pm_block; + /** @rebind_resume_list: List of wq items to kick on resume. */ + struct list_head rebind_resume_list; + /** @rebind_resume_lock: Lock to protect the rebind_resume_list */ + struct mutex rebind_resume_lock; /** @pmt: Support the PMT driver callback interface */ struct { diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 44364c042ad7..374c831e691b 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -237,6 +237,15 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) goto err_unlock_list; } + /* + * It's OK to block interruptible here with the vm lock held, since + * on task freezing during suspend / hibernate, the call will + * return -ERESTARTSYS and the IOCTL will be rerun. + */ + err = wait_for_completion_interruptible(&xe->pm_block); + if (err) + goto err_unlock_list; + vm_exec.vm = &vm->gpuvm; vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT; if (xe_vm_in_lr_mode(vm)) { diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c index c9d5f0a21c48..bb9b6ecad2af 100644 --- a/drivers/gpu/drm/xe/xe_pm.c +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -24,6 +24,7 @@ #include "xe_pcode.h" #include "xe_pxp.h" #include "xe_trace.h" +#include "xe_vm.h" #include "xe_wa.h" /** @@ -290,6 +291,19 @@ static u32 vram_threshold_value(struct xe_device *xe) return DEFAULT_VRAM_THRESHOLD; } +static void xe_pm_wake_rebind_workers(struct xe_device *xe) +{ + struct xe_vm *vm, *next; + + mutex_lock(&xe->rebind_resume_lock); + list_for_each_entry_safe(vm, next, &xe->rebind_resume_list, + preempt.pm_activate_link) { + list_del_init(&vm->preempt.pm_activate_link); + xe_vm_resume_rebind_worker(vm); + } + mutex_unlock(&xe->rebind_resume_lock); +} + static int xe_pm_notifier_callback(struct notifier_block *nb, unsigned long action, void *data) { @@ -299,6 +313,7 @@ static int xe_pm_notifier_callback(struct notifier_block *nb, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: + reinit_completion(&xe->pm_block); xe_pm_runtime_get(xe); err = xe_bo_evict_all_user(xe); if (err) @@ -315,6 +330,8 @@ static int xe_pm_notifier_callback(struct notifier_block *nb, break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: + complete_all(&xe->pm_block); + xe_pm_wake_rebind_workers(xe); xe_bo_notifier_unprepare_all_pinned(xe); xe_pm_runtime_put(xe); break; @@ -341,6 +358,14 @@ int xe_pm_init(struct xe_device *xe) if (err) return err; + err = drmm_mutex_init(&xe->drm, &xe->rebind_resume_lock); + if (err) + goto err_unregister; + + init_completion(&xe->pm_block); + complete_all(&xe->pm_block); + INIT_LIST_HEAD(&xe->rebind_resume_list); + /* For now suspend/resume is only allowed with GuC */ if (!xe_device_uc_enabled(xe)) return 0; diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index d60c4b115304..dc4f61e56579 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -393,6 +393,9 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec) list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind, &vm->rebind_list); + if (!try_wait_for_completion(&vm->xe->pm_block)) + return -EAGAIN; + ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false); if (ret) return ret; @@ -479,6 +482,33 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm, return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues); } +static bool vm_suspend_rebind_worker(struct xe_vm *vm) +{ + struct xe_device *xe = vm->xe; + bool ret = false; + + mutex_lock(&xe->rebind_resume_lock); + if (!try_wait_for_completion(&vm->xe->pm_block)) { + ret = true; + list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list); + } + mutex_unlock(&xe->rebind_resume_lock); + + return ret; +} + +/** + * xe_vm_resume_rebind_worker() - Resume the rebind worker. + * @vm: The vm whose preempt worker to resume. + * + * Resume a preempt worker that was previously suspended by + * vm_suspend_rebind_worker(). + */ +void xe_vm_resume_rebind_worker(struct xe_vm *vm) +{ + queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work); +} + static void preempt_rebind_work_func(struct work_struct *w) { struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work); @@ -502,6 +532,11 @@ static void preempt_rebind_work_func(struct work_struct *w) } retry: + if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) { + up_write(&vm->lock); + return; + } + if (xe_vm_userptr_check_repin(vm)) { err = xe_vm_userptr_pin(vm); if (err) @@ -1714,6 +1749,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef) if (flags & XE_VM_FLAG_LR_MODE) { INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func); xe_pm_runtime_get_noresume(xe); + INIT_LIST_HEAD(&vm->preempt.pm_activate_link); } if (flags & XE_VM_FLAG_FAULT_MODE) { @@ -1895,8 +1931,12 @@ void xe_vm_close_and_put(struct xe_vm *vm) xe_assert(xe, !vm->preempt.num_exec_queues); xe_vm_close(vm); - if (xe_vm_in_preempt_fence_mode(vm)) + if (xe_vm_in_preempt_fence_mode(vm)) { + mutex_lock(&xe->rebind_resume_lock); + list_del_init(&vm->preempt.pm_activate_link); + mutex_unlock(&xe->rebind_resume_lock); flush_work(&vm->preempt.rebind_work); + } if (xe_vm_in_fault_mode(vm)) xe_svm_close(vm); diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h index 2ecb417c19a2..82b112795807 100644 --- a/drivers/gpu/drm/xe/xe_vm.h +++ b/drivers/gpu/drm/xe/xe_vm.h @@ -273,6 +273,8 @@ struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo, struct xe_exec_queue *q, u64 addr, enum xe_cache_level cache_lvl); +void xe_vm_resume_rebind_worker(struct xe_vm *vm); + /** * xe_vm_resv() - Return's the vm's reservation object * @vm: The vm diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 8a07feef503b..6058cf739388 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -293,6 +293,11 @@ struct xe_vm { * BOs */ struct work_struct rebind_work; + /** + * @preempt.pm_activate_link: Link to list of rebind workers to be + * kicked on resume. + */ + struct list_head pm_activate_link; } preempt; /** @um: unified memory state */ From fd99415ec8a80866e5e19f7835876e7b4f294946 Mon Sep 17 00:00:00 2001 From: Julia Filipchuk Date: Wed, 3 Sep 2025 12:00:38 -0700 Subject: [PATCH 203/450] drm/xe: Extend Wa_13011645652 to PTL-H, WCL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expand workaround to additional graphics architectures. Cc: Vinay Belgaumkar Cc: Stuart Summers Cc: Daniele Ceraolo Spurio Cc: Lucas De Marchi Cc: Thomas Hellström Cc: Rodrigo Vivi Cc: intel-xe@lists.freedesktop.org Cc: # v6.17+ Signed-off-by: Julia Filipchuk Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250903190122.1028373-2-julia.filipchuk@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 6fc957185e1691bb6dfa4193698a229db537c2a2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_wa_oob.rules | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules index e990f20eccfe..710f4423726c 100644 --- a/drivers/gpu/drm/xe/xe_wa_oob.rules +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules @@ -30,7 +30,8 @@ 16022287689 GRAPHICS_VERSION(2001) GRAPHICS_VERSION(2004) 13011645652 GRAPHICS_VERSION(2004) - GRAPHICS_VERSION(3001) + GRAPHICS_VERSION_RANGE(3000, 3001) + GRAPHICS_VERSION(3003) 14022293748 GRAPHICS_VERSION_RANGE(2001, 2002) GRAPHICS_VERSION(2004) GRAPHICS_VERSION_RANGE(3000, 3001) From 503f1c72c31bbee21e669a08cf65c49e96d42755 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 27 Aug 2025 14:17:36 -0700 Subject: [PATCH 204/450] i40e: fix Jumbo Frame support after iPXE boot The i40e hardware has multiple hardware settings which define the Maximum Frame Size (MFS) of the physical port. The firmware has an AdminQ command (0x0603) to configure the MFS, but the i40e Linux driver never issues this command. In most cases this is no problem, as the NVM default value has the device configured for its maximum value of 9728. Unfortunately, recent versions of the iPXE intelxl driver now issue the 0x0603 Set Mac Config command, modifying the MFS and reducing it from its default value of 9728. This occurred as part of iPXE commit 6871a7de705b ("[intelxl] Use admin queue to set port MAC address and maximum frame size"), a prerequisite change for supporting the E800 series hardware in iPXE. Both the E700 and E800 firmware support the AdminQ command, and the iPXE code shares much of the logic between the two device drivers. The ice E800 Linux driver already issues the 0x0603 Set Mac Config command early during probe, and is thus unaffected by the iPXE change. Since commit 3a2c6ced90e1 ("i40e: Add a check to see if MFS is set"), the i40e driver does check the I40E_PRTGL_SAH register, but it only logs a warning message if its value is below the 9728 default. This register also only covers received packets and not transmitted packets. A warning can inform system administrators, but does not correct the issue. No interactions from userspace cause the driver to write to PRTGL_SAH or issue the 0x0603 AdminQ command. Only a GLOBR reset will restore the value to its default value. There is no obvious method to trigger a GLOBR reset from user space. To fix this, introduce the i40e_aq_set_mac_config() function, similar to the one from the ice driver. Call this during early probe to ensure that the device configuration matches driver expectation. Unlike E800, the E700 firmware also has a bit to control whether the MAC should append CRC data. It is on by default, but setting a 0 to this bit would disable CRC. The i40e implementation must set this bit to ensure CRC will be appended by the MAC. In addition to the AQ command, instead of just checking the I40E_PRTGL_SAH register, update its value to the 9728 default and write it back. This ensures that the hardware is in the expected state, regardless of whether the iPXE (or any other early boot driver) has modified this state. This is a better user experience, as we now fix the issues with larger MTU instead of merely warning. It also aligns with the way the ice E800 series driver works. A final note: The Fixes tag provided here is not strictly accurate. The issue occurs as a result of an external entity (the iPXE intelxl driver), and this is not a regression specifically caused by the mentioned change. However, I believe the original change to just warn about PRTGL_SAH being too low was an insufficient fix. Fixes: 3a2c6ced90e1 ("i40e: Add a check to see if MFS is set") Link: https://github.com/ipxe/ipxe/commit/6871a7de705b6f6a4046f0d19da9bcd689c3bc8e Signed-off-by: Jacob Keller Signed-off-by: Aleksandr Loktionov Reviewed-by: Michal Schmidt Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/i40e/i40e_adminq_cmd.h | 1 + drivers/net/ethernet/intel/i40e/i40e_common.c | 34 +++++++++++++++++++ drivers/net/ethernet/intel/i40e/i40e_main.c | 16 +++++---- .../net/ethernet/intel/i40e/i40e_prototype.h | 2 ++ 4 files changed, 47 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index 76d872b91a38..cc02a85ad42b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -1561,6 +1561,7 @@ I40E_CHECK_CMD_LENGTH(i40e_aq_set_phy_config); struct i40e_aq_set_mac_config { __le16 max_frame_size; u8 params; +#define I40E_AQ_SET_MAC_CONFIG_CRC_EN BIT(2) u8 tx_timer_priority; /* bitmap */ __le16 tx_timer_value; __le16 fc_refresh_threshold; diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 270e7e8cf9cf..59f5c1e810eb 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1189,6 +1189,40 @@ int i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures, return status; } +/** + * i40e_aq_set_mac_config - Configure MAC settings + * @hw: pointer to the hw struct + * @max_frame_size: Maximum Frame Size to be supported by the port + * @cmd_details: pointer to command details structure or NULL + * + * Set MAC configuration (0x0603). Note that max_frame_size must be greater + * than zero. + * + * Return: 0 on success, or a negative error code on failure. + */ +int i40e_aq_set_mac_config(struct i40e_hw *hw, u16 max_frame_size, + struct i40e_asq_cmd_details *cmd_details) +{ + struct i40e_aq_set_mac_config *cmd; + struct libie_aq_desc desc; + + cmd = libie_aq_raw(&desc); + + if (max_frame_size == 0) + return -EINVAL; + + i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_set_mac_config); + + cmd->max_frame_size = cpu_to_le16(max_frame_size); + cmd->params = I40E_AQ_SET_MAC_CONFIG_CRC_EN; + +#define I40E_AQ_SET_MAC_CONFIG_FC_DEFAULT_THRESHOLD 0x7FFF + cmd->fc_refresh_threshold = + cpu_to_le16(I40E_AQ_SET_MAC_CONFIG_FC_DEFAULT_THRESHOLD); + + return i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details); +} + /** * i40e_aq_clear_pxe_mode * @hw: pointer to the hw struct diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index dd21d93d39dd..b14019d44b58 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16045,13 +16045,17 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev_dbg(&pf->pdev->dev, "get supported phy types ret = %pe last_status = %s\n", ERR_PTR(err), libie_aq_str(pf->hw.aq.asq_last_status)); - /* make sure the MFS hasn't been set lower than the default */ #define MAX_FRAME_SIZE_DEFAULT 0x2600 - val = FIELD_GET(I40E_PRTGL_SAH_MFS_MASK, - rd32(&pf->hw, I40E_PRTGL_SAH)); - if (val < MAX_FRAME_SIZE_DEFAULT) - dev_warn(&pdev->dev, "MFS for port %x (%d) has been set below the default (%d)\n", - pf->hw.port, val, MAX_FRAME_SIZE_DEFAULT); + + err = i40e_aq_set_mac_config(hw, MAX_FRAME_SIZE_DEFAULT, NULL); + if (err) + dev_warn(&pdev->dev, "set mac config ret = %pe last_status = %s\n", + ERR_PTR(err), libie_aq_str(pf->hw.aq.asq_last_status)); + + /* Make sure the MFS is set to the expected value */ + val = rd32(hw, I40E_PRTGL_SAH); + FIELD_MODIFY(I40E_PRTGL_SAH_MFS_MASK, &val, MAX_FRAME_SIZE_DEFAULT); + wr32(hw, I40E_PRTGL_SAH, val); /* Add a filter to drop all Flow control frames from any VSI from being * transmitted. By doing so we stop a malicious VF from sending out diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index aef5de53ce3b..26bb7bffe361 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -98,6 +98,8 @@ int i40e_aq_set_mac_loopback(struct i40e_hw *hw, struct i40e_asq_cmd_details *cmd_details); int i40e_aq_set_phy_int_mask(struct i40e_hw *hw, u16 mask, struct i40e_asq_cmd_details *cmd_details); +int i40e_aq_set_mac_config(struct i40e_hw *hw, u16 max_frame_size, + struct i40e_asq_cmd_details *cmd_details); int i40e_aq_clear_pxe_mode(struct i40e_hw *hw, struct i40e_asq_cmd_details *cmd_details); int i40e_aq_set_link_restart_an(struct i40e_hw *hw, From 7838fb5f119191403560eca2e23613380c0e425e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 4 Sep 2025 12:35:05 -0400 Subject: [PATCH 205/450] drm/amdgpu: fix a memory leak in fence cleanup when unloading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b61badd20b44 ("drm/amdgpu: fix usage slab after free") reordered when amdgpu_fence_driver_sw_fini() was called after that patch, amdgpu_fence_driver_sw_fini() effectively became a no-op as the sched entities we never freed because the ring pointers were already set to NULL. Remove the NULL setting. Reported-by: Lin.Cao Cc: Vitaly Prosyak Cc: Christian König Fixes: b61badd20b44 ("drm/amdgpu: fix usage slab after free") Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit a525fa37aac36c4591cc8b07ae8957862415fbd5) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 6379bb25bf5c..486c3646710c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -421,8 +421,6 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring) dma_fence_put(ring->vmid_wait); ring->vmid_wait = NULL; ring->me = 0; - - ring->adev->rings[ring->idx] = NULL; } /** From 1d66c3f2b8c0b5c51f3f4fe29b362c9851190c5a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 3 Sep 2025 09:11:12 -0400 Subject: [PATCH 206/450] drm/amd/display: use udelay rather than fsleep This function can be called from an atomic context so we can't use fsleep(). Fixes: 01f60348d8fb ("drm/amd/display: Fix 'failed to blank crtc!'") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4549 Cc: Wen Chen Cc: Fangzhi Zuo Cc: Nicholas Kazlauskas Cc: Harry Wentland Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher (cherry picked from commit 27e4dc2c0543fd1808cc52bd888ee1e0533c4a2e) --- drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c index 3207addbd4eb..b7c2d3095b25 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c @@ -955,7 +955,7 @@ enum dc_status dcn20_enable_stream_timing( return DC_ERROR_UNEXPECTED; } - fsleep(stream->timing.v_total * (stream->timing.h_total * 10000u / stream->timing.pix_clk_100hz)); + udelay(stream->timing.v_total * (stream->timing.h_total * 10000u / stream->timing.pix_clk_100hz)); params.vertical_total_min = stream->adjust.v_total_min; params.vertical_total_max = stream->adjust.v_total_max; From 857ccfc19f9be1269716f3d681650c1bd149a656 Mon Sep 17 00:00:00 2001 From: Pratap Nirujogi Date: Wed, 3 Sep 2025 16:00:24 -0400 Subject: [PATCH 207/450] drm/amd/amdgpu: Declare isp firmware binary file Declare isp firmware file isp_4_1_1.bin required by isp4.1.1 device. Suggested-by: Alexey Zagorodnikov Reviewed-by: Mario Limonciello Signed-off-by: Pratap Nirujogi Signed-off-by: Alex Deucher (cherry picked from commit d97b74a833eba1f4f69f67198fd98ef036c0e5f9) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c b/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c index a887df520414..4258d3e0b706 100644 --- a/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c +++ b/drivers/gpu/drm/amd/amdgpu/isp_v4_1_1.c @@ -29,6 +29,8 @@ #include "amdgpu.h" #include "isp_v4_1_1.h" +MODULE_FIRMWARE("amdgpu/isp_4_1_1.bin"); + #define ISP_PERFORMANCE_STATE_LOW 0 #define ISP_PERFORMANCE_STATE_HIGH 1 From 2b10cb58d7a3fd621ec9b2ba765a092e562ef998 Mon Sep 17 00:00:00 2001 From: David Rosca Date: Mon, 18 Aug 2025 09:06:58 +0200 Subject: [PATCH 208/450] drm/amdgpu/vcn4: Fix IB parsing with multiple engine info packages There can be multiple engine info packages in one IB and the first one may be common engine, not decode/encode. We need to parse the entire IB instead of stopping after finding first engine info. Signed-off-by: David Rosca Reviewed-by: Leo Liu Signed-off-by: Alex Deucher (cherry picked from commit dc8f9f0f45166a6b37864e7a031c726981d6e5fc) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 48 +++++++++++---------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index 1924e075b66f..311fb595bd69 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -1907,22 +1907,16 @@ static int vcn_v4_0_dec_msg(struct amdgpu_cs_parser *p, struct amdgpu_job *job, #define RADEON_VCN_ENGINE_TYPE_ENCODE (0x00000002) #define RADEON_VCN_ENGINE_TYPE_DECODE (0x00000003) - #define RADEON_VCN_ENGINE_INFO (0x30000001) -#define RADEON_VCN_ENGINE_INFO_MAX_OFFSET 16 - #define RENCODE_ENCODE_STANDARD_AV1 2 #define RENCODE_IB_PARAM_SESSION_INIT 0x00000003 -#define RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET 64 -/* return the offset in ib if id is found, -1 otherwise - * to speed up the searching we only search upto max_offset - */ -static int vcn_v4_0_enc_find_ib_param(struct amdgpu_ib *ib, uint32_t id, int max_offset) +/* return the offset in ib if id is found, -1 otherwise */ +static int vcn_v4_0_enc_find_ib_param(struct amdgpu_ib *ib, uint32_t id, int start) { int i; - for (i = 0; i < ib->length_dw && i < max_offset && ib->ptr[i] >= 8; i += ib->ptr[i]/4) { + for (i = start; i < ib->length_dw && ib->ptr[i] >= 8; i += ib->ptr[i] / 4) { if (ib->ptr[i + 1] == id) return i; } @@ -1937,33 +1931,29 @@ static int vcn_v4_0_ring_patch_cs_in_place(struct amdgpu_cs_parser *p, struct amdgpu_vcn_decode_buffer *decode_buffer; uint64_t addr; uint32_t val; - int idx; + int idx = 0, sidx; /* The first instance can decode anything */ if (!ring->me) return 0; - /* RADEON_VCN_ENGINE_INFO is at the top of ib block */ - idx = vcn_v4_0_enc_find_ib_param(ib, RADEON_VCN_ENGINE_INFO, - RADEON_VCN_ENGINE_INFO_MAX_OFFSET); - if (idx < 0) /* engine info is missing */ - return 0; + while ((idx = vcn_v4_0_enc_find_ib_param(ib, RADEON_VCN_ENGINE_INFO, idx)) >= 0) { + val = amdgpu_ib_get_value(ib, idx + 2); /* RADEON_VCN_ENGINE_TYPE */ + if (val == RADEON_VCN_ENGINE_TYPE_DECODE) { + decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[idx + 6]; - val = amdgpu_ib_get_value(ib, idx + 2); /* RADEON_VCN_ENGINE_TYPE */ - if (val == RADEON_VCN_ENGINE_TYPE_DECODE) { - decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[idx + 6]; + if (!(decode_buffer->valid_buf_flag & 0x1)) + return 0; - if (!(decode_buffer->valid_buf_flag & 0x1)) - return 0; - - addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 | - decode_buffer->msg_buffer_address_lo; - return vcn_v4_0_dec_msg(p, job, addr); - } else if (val == RADEON_VCN_ENGINE_TYPE_ENCODE) { - idx = vcn_v4_0_enc_find_ib_param(ib, RENCODE_IB_PARAM_SESSION_INIT, - RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET); - if (idx >= 0 && ib->ptr[idx + 2] == RENCODE_ENCODE_STANDARD_AV1) - return vcn_v4_0_limit_sched(p, job); + addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 | + decode_buffer->msg_buffer_address_lo; + return vcn_v4_0_dec_msg(p, job, addr); + } else if (val == RADEON_VCN_ENGINE_TYPE_ENCODE) { + sidx = vcn_v4_0_enc_find_ib_param(ib, RENCODE_IB_PARAM_SESSION_INIT, idx); + if (sidx >= 0 && ib->ptr[sidx + 2] == RENCODE_ENCODE_STANDARD_AV1) + return vcn_v4_0_limit_sched(p, job); + } + idx += ib->ptr[idx] / 4; } return 0; } From 3318f2d20ce48849855df5e190813826d0bc3653 Mon Sep 17 00:00:00 2001 From: David Rosca Date: Mon, 18 Aug 2025 09:18:37 +0200 Subject: [PATCH 209/450] drm/amdgpu/vcn: Allow limiting ctx to instance 0 for AV1 at any time There is no reason to require this to happen on first submitted IB only. We need to wait for the queue to be idle, but it can be done at any time (including when there are multiple video sessions active). Signed-off-by: David Rosca Reviewed-by: Leo Liu Signed-off-by: Alex Deucher (cherry picked from commit 8908fdce0634a623404e9923ed2f536101a39db5) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 12 ++++++++---- drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c index 4b8f4407047f..2811226b0ea5 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c @@ -1888,15 +1888,19 @@ static int vcn_v3_0_limit_sched(struct amdgpu_cs_parser *p, struct amdgpu_job *job) { struct drm_gpu_scheduler **scheds; - - /* The create msg must be in the first IB submitted */ - if (atomic_read(&job->base.entity->fence_seq)) - return -EINVAL; + struct dma_fence *fence; /* if VCN0 is harvested, we can't support AV1 */ if (p->adev->vcn.harvest_config & AMDGPU_VCN_HARVEST_VCN0) return -EINVAL; + /* wait for all jobs to finish before switching to instance 0 */ + fence = amdgpu_ctx_get_fence(p->ctx, job->base.entity, ~0ull); + if (fence) { + dma_fence_wait(fence, false); + dma_fence_put(fence); + } + scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_DEC] [AMDGPU_RING_PRIO_DEFAULT].sched; drm_sched_entity_modify_sched(job->base.entity, scheds, 1); diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index 311fb595bd69..706f3b2f484f 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -1808,15 +1808,19 @@ static int vcn_v4_0_limit_sched(struct amdgpu_cs_parser *p, struct amdgpu_job *job) { struct drm_gpu_scheduler **scheds; - - /* The create msg must be in the first IB submitted */ - if (atomic_read(&job->base.entity->fence_seq)) - return -EINVAL; + struct dma_fence *fence; /* if VCN0 is harvested, we can't support AV1 */ if (p->adev->vcn.harvest_config & AMDGPU_VCN_HARVEST_VCN0) return -EINVAL; + /* wait for all jobs to finish before switching to instance 0 */ + fence = amdgpu_ctx_get_fence(p->ctx, job->base.entity, ~0ull); + if (fence) { + dma_fence_wait(fence, false); + dma_fence_put(fence); + } + scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_ENC] [AMDGPU_RING_PRIO_0].sched; drm_sched_entity_modify_sched(job->base.entity, scheds, 1); From f9bb6ffa7f5ad0f8ee0f53fc4a10655872ee4a14 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 Aug 2025 16:36:56 +0200 Subject: [PATCH 210/450] bpf: Fix out-of-bounds dynptr write in bpf_crypto_crypt Stanislav reported that in bpf_crypto_crypt() the destination dynptr's size is not validated to be at least as large as the source dynptr's size before calling into the crypto backend with 'len = src_len'. This can result in an OOB write when the destination is smaller than the source. Concretely, in mentioned function, psrc and pdst are both linear buffers fetched from each dynptr: psrc = __bpf_dynptr_data(src, src_len); [...] pdst = __bpf_dynptr_data_rw(dst, dst_len); [...] err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv) : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv); The crypto backend expects pdst to be large enough with a src_len length that can be written. Add an additional src_len > dst_len check and bail out if it's the case. Note that these kfuncs are accessible under root privileges only. Fixes: 3e1c6f35409f ("bpf: make common crypto API for TC/XDP programs") Reported-by: Stanislav Fort Signed-off-by: Daniel Borkmann Cc: Vadim Fedorenko Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20250829143657.318524-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 94854cd9c4cc..83c4d9943084 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, siv_len = siv ? __bpf_dynptr_size(siv) : 0; src_len = __bpf_dynptr_size(src); dst_len = __bpf_dynptr_size(dst); - if (!src_len || !dst_len) + if (!src_len || !dst_len || src_len > dst_len) return -EINVAL; if (siv_len != ctx->siv_len) From 3aa9b9a165d5e9afc7d8b5dbcd508810c05c8e89 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 Aug 2025 16:36:57 +0200 Subject: [PATCH 211/450] selftests/bpf: Extend crypto_sanity selftest with invalid dst buffer Small cleanup and test extension to probe the bpf_crypto_{encrypt,decrypt}() kfunc when a bad dst buffer is passed in to assert that an error is returned. Also, encrypt_sanity() and skb_crypto_setup() were explicit to set the global status variable to zero before any test, so do the same for decrypt_sanity(). Do not explicitly zero the on-stack err before bpf_crypto_ctx_create() given the kfunc is expected to do it internally for the success case. Before kernel fix: # ./vmtest.sh -- ./test_progs -t crypto [...] [ 1.531200] bpf_testmod: loading out-of-tree module taints kernel. [ 1.533388] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel #87/1 crypto_basic/crypto_release:OK #87/2 crypto_basic/crypto_acquire:OK #87 crypto_basic:OK test_crypto_sanity:PASS:skel open 0 nsec test_crypto_sanity:PASS:ip netns add crypto_sanity_ns 0 nsec test_crypto_sanity:PASS:ip -net crypto_sanity_ns -6 addr add face::1/128 dev lo nodad 0 nsec test_crypto_sanity:PASS:ip -net crypto_sanity_ns link set dev lo up 0 nsec test_crypto_sanity:PASS:open_netns 0 nsec test_crypto_sanity:PASS:AF_ALG init fail 0 nsec test_crypto_sanity:PASS:if_nametoindex lo 0 nsec test_crypto_sanity:PASS:skb_crypto_setup fd 0 nsec test_crypto_sanity:PASS:skb_crypto_setup 0 nsec test_crypto_sanity:PASS:skb_crypto_setup retval 0 nsec test_crypto_sanity:PASS:skb_crypto_setup status 0 nsec test_crypto_sanity:PASS:create qdisc hook 0 nsec test_crypto_sanity:PASS:make_sockaddr 0 nsec test_crypto_sanity:PASS:attach encrypt filter 0 nsec test_crypto_sanity:PASS:encrypt socket 0 nsec test_crypto_sanity:PASS:encrypt send 0 nsec test_crypto_sanity:FAIL:encrypt status unexpected error: -5 (errno 95) #88 crypto_sanity:FAIL Summary: 1/2 PASSED, 0 SKIPPED, 1 FAILED After kernel fix: # ./vmtest.sh -- ./test_progs -t crypto [...] [ 1.540963] bpf_testmod: loading out-of-tree module taints kernel. [ 1.542404] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel #87/1 crypto_basic/crypto_release:OK #87/2 crypto_basic/crypto_acquire:OK #87 crypto_basic:OK #88 crypto_sanity:OK Summary: 2/2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Cc: Vadim Fedorenko Link: https://lore.kernel.org/r/20250829143657.318524-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/crypto_sanity.c | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/crypto_sanity.c b/tools/testing/selftests/bpf/progs/crypto_sanity.c index 645be6cddf36..dfd8a258f14a 100644 --- a/tools/testing/selftests/bpf/progs/crypto_sanity.c +++ b/tools/testing/selftests/bpf/progs/crypto_sanity.c @@ -14,7 +14,7 @@ unsigned char key[256] = {}; u16 udp_test_port = 7777; u32 authsize, key_len; char algo[128] = {}; -char dst[16] = {}; +char dst[16] = {}, dst_bad[8] = {}; int status; static int skb_dynptr_validate(struct __sk_buff *skb, struct bpf_dynptr *psrc) @@ -59,10 +59,9 @@ int skb_crypto_setup(void *ctx) .authsize = authsize, }; struct bpf_crypto_ctx *cctx; - int err = 0; + int err; status = 0; - if (key_len > 256) { status = -EINVAL; return 0; @@ -70,8 +69,8 @@ int skb_crypto_setup(void *ctx) __builtin_memcpy(¶ms.algo, algo, sizeof(algo)); __builtin_memcpy(¶ms.key, key, sizeof(key)); - cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); + cctx = bpf_crypto_ctx_create(¶ms, sizeof(params), &err); if (!cctx) { status = err; return 0; @@ -80,7 +79,6 @@ int skb_crypto_setup(void *ctx) err = crypto_ctx_insert(cctx); if (err && err != -EEXIST) status = err; - return 0; } @@ -92,6 +90,7 @@ int decrypt_sanity(struct __sk_buff *skb) struct bpf_dynptr psrc, pdst; int err; + status = 0; err = skb_dynptr_validate(skb, &psrc); if (err < 0) { status = err; @@ -110,13 +109,23 @@ int decrypt_sanity(struct __sk_buff *skb) return TC_ACT_SHOT; } - /* dst is a global variable to make testing part easier to check. In real - * production code, a percpu map should be used to store the result. + /* Check also bad case where the dst buffer is smaller than the + * skb's linear section. + */ + bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst); + status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL); + if (!status) + status = -EIO; + if (status != -EINVAL) + goto err; + + /* dst is a global variable to make testing part easier to check. + * In real production code, a percpu map should be used to store + * the result. */ bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); - status = bpf_crypto_decrypt(ctx, &psrc, &pdst, NULL); - +err: return TC_ACT_SHOT; } @@ -129,7 +138,6 @@ int encrypt_sanity(struct __sk_buff *skb) int err; status = 0; - err = skb_dynptr_validate(skb, &psrc); if (err < 0) { status = err; @@ -148,13 +156,23 @@ int encrypt_sanity(struct __sk_buff *skb) return TC_ACT_SHOT; } - /* dst is a global variable to make testing part easier to check. In real - * production code, a percpu map should be used to store the result. + /* Check also bad case where the dst buffer is smaller than the + * skb's linear section. + */ + bpf_dynptr_from_mem(dst_bad, sizeof(dst_bad), 0, &pdst); + status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL); + if (!status) + status = -EIO; + if (status != -EINVAL) + goto err; + + /* dst is a global variable to make testing part easier to check. + * In real production code, a percpu map should be used to store + * the result. */ bpf_dynptr_from_mem(dst, sizeof(dst), 0, &pdst); - status = bpf_crypto_encrypt(ctx, &psrc, &pdst, NULL); - +err: return TC_ACT_SHOT; } From 7edfc024708258d75f65fadffd7e5f6ac46810b6 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sat, 30 Aug 2025 00:31:58 +0800 Subject: [PATCH 212/450] bpf: Fix bpf_strnstr() to handle suffix match cases better bpf_strnstr() should not treat the ending '\0' of s2 as a matching character if the parameter 'len' equal to s2 string length, for example: 1. bpf_strnstr("openat", "open", 4) = -ENOENT 2. bpf_strnstr("openat", "open", 5) = 0 This patch makes (1) return 0, fix just the `len == strlen(s2)` case. And fix a more general case when s2 is a suffix of the first len characters of s1. Fixes: e91370550f1f ("bpf: Add kfuncs for read-only string operations") Signed-off-by: Rong Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/tencent_17DC57B9D16BC443837021BEACE84B7C1507@qq.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6b4877e85a68..b9b0c5fe33f6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3664,10 +3664,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { - for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) { + for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&c2, s2__ign + j, char, err_out); if (c2 == '\0') return i; + /* + * We allow reading an extra byte from s2 (note the + * `i + j <= len` above) to cover the case when s2 is + * a suffix of the first len chars of s1. + */ + if (i + j == len) + break; __get_kernel_nofault(&c1, s1__ign + j, char, err_out); if (c1 == '\0') return -ENOENT; From 5d40c038c879eeb910039adeaf6102e1c4dda807 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 29 Aug 2025 04:53:57 +0200 Subject: [PATCH 213/450] selftests/bpf: Fix "expression result unused" warnings with icecc icecc is a compiler wrapper that distributes compile jobs over a build farm [1]. It works by sending toolchain binaries and preprocessed source code to remote machines. Unfortunately using it with BPF selftests causes build failures due to a clang bug [2]. The problem is that clang suppresses the -Wunused-value warning if the unused expression comes from a macro expansion. Since icecc compiles preprocessed source code, this information is not available. This leads to -Wunused-value false positives. obj_new_no_struct() and obj_new_acq() use the bpf_obj_new() macro and discard the result. arena_spin_lock_slowpath() uses two macros that produce values and ignores the results. Add (void) casts to explicitly indicate that this is intentional and suppress the warning. An alternative solution is to change the macros to not produce values. This would work today for the arena_spin_lock_slowpath() issue, but in the future there may appear users who need them. Another potential solution is to replace these macros with functions. Unfortunately this would not work, because these macros work with unknown types and control flow. [1] https://github.com/icecc/icecream [2] https://github.com/llvm/llvm-project/issues/142614 Signed-off-by: Ilya Leoshkevich Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250829030017.102615-2-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h | 4 ++-- tools/testing/selftests/bpf/progs/linked_list_fail.c | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h index d67466c1ff77..f90531cf3ee5 100644 --- a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h +++ b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h @@ -302,7 +302,7 @@ int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val * barriers. */ if (val & _Q_LOCKED_MASK) - smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + (void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); /* * take ownership and clear the pending bit. @@ -380,7 +380,7 @@ int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val /* Link @node into the waitqueue. */ WRITE_ONCE(prev->next, node); - arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + (void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); /* * While waiting for the MCS lock, the next pointer may have diff --git a/tools/testing/selftests/bpf/progs/linked_list_fail.c b/tools/testing/selftests/bpf/progs/linked_list_fail.c index 6438982b928b..ddd26d1a083f 100644 --- a/tools/testing/selftests/bpf/progs/linked_list_fail.c +++ b/tools/testing/selftests/bpf/progs/linked_list_fail.c @@ -226,8 +226,7 @@ int obj_new_no_composite(void *ctx) SEC("?tc") int obj_new_no_struct(void *ctx) { - - bpf_obj_new(union { int data; unsigned udata; }); + (void)bpf_obj_new(union { int data; unsigned udata; }); return 0; } @@ -252,7 +251,7 @@ int new_null_ret(void *ctx) SEC("?tc") int obj_new_acq(void *ctx) { - bpf_obj_new(struct foo); + (void)bpf_obj_new(struct foo); return 0; } From 6624fb2f3382271953f951d46f2ea30415a0917e Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sat, 30 Aug 2025 00:32:13 +0800 Subject: [PATCH 214/450] selftests/bpf: Add tests for bpf_strnstr Add tests for bpf_strnstr(): bpf_strnstr("", "", 0) = 0 bpf_strnstr("hello world", "hello", 5) = 0 bpf_strnstr(str, "hello", 4) = -ENOENT bpf_strnstr("", "a", 0) = -ENOENT Signed-off-by: Rong Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/tencent_2ED218F8082565C95D86A804BDDA8DBA200A@qq.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/string_kfuncs_success.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index 46697f381878..a47690174e0e 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -30,8 +30,12 @@ __test(2) int test_strcspn(void *ctx) { return bpf_strcspn(str, "lo"); } __test(6) int test_strstr_found(void *ctx) { return bpf_strstr(str, "world"); } __test(-ENOENT) int test_strstr_notfound(void *ctx) { return bpf_strstr(str, "hi"); } __test(0) int test_strstr_empty(void *ctx) { return bpf_strstr(str, ""); } -__test(0) int test_strnstr_found(void *ctx) { return bpf_strnstr(str, "hello", 6); } -__test(-ENOENT) int test_strnstr_notfound(void *ctx) { return bpf_strnstr(str, "hi", 10); } +__test(0) int test_strnstr_found1(void *ctx) { return bpf_strnstr("", "", 0); } +__test(0) int test_strnstr_found2(void *ctx) { return bpf_strnstr(str, "hello", 5); } +__test(0) int test_strnstr_found3(void *ctx) { return bpf_strnstr(str, "hello", 6); } +__test(-ENOENT) int test_strnstr_notfound1(void *ctx) { return bpf_strnstr(str, "hi", 10); } +__test(-ENOENT) int test_strnstr_notfound2(void *ctx) { return bpf_strnstr(str, "hello", 4); } +__test(-ENOENT) int test_strnstr_notfound3(void *ctx) { return bpf_strnstr("", "a", 0); } __test(0) int test_strnstr_empty(void *ctx) { return bpf_strnstr(str, "", 1); } char _license[] SEC("license") = "GPL"; From 6c6f5c19e67c89e974ef0dd8601804eaa4ae868d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 9 Sep 2025 10:16:38 -0700 Subject: [PATCH 215/450] bpf: Update the list of BPF selftests maintainers Unfortunately Mykola won't participate in BPF selftests maintenance anymore. Remove the entry on his behalf. Acked-by: Mykola Lysenko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250909171638.2417272-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa4..6056ad6f1afa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4682,7 +4682,6 @@ F: security/bpf/ BPF [SELFTESTS] (Test Runners & Infrastructure) M: Andrii Nakryiko M: Eduard Zingerman -R: Mykola Lysenko L: bpf@vger.kernel.org S: Maintained F: tools/testing/selftests/bpf/ From 30f241fcf52aaaef7ac16e66530faa11be78a865 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 4 Sep 2025 21:49:07 +0200 Subject: [PATCH 216/450] xsk: Fix immature cq descriptor production Eryk reported an issue that I have put under Closes: tag, related to umem addrs being prematurely produced onto pool's completion queue. Let us make the skb's destructor responsible for producing all addrs that given skb used. Commit from fixes tag introduced the buggy behavior, it was not broken from day 1, but rather when xsk multi-buffer got introduced. In order to mitigate performance impact as much as possible, mimic the linear and frag parts within skb by storing the first address from XSK descriptor at sk_buff::destructor_arg. For fragments, store them at ::cb via list. The nodes that will go onto list will be allocated via kmem_cache. xsk_destruct_skb() will consume address stored at ::destructor_arg and optionally go through list from ::cb, if count of descriptors associated with this particular skb is bigger than 1. Previous approach where whole array for storing UMEM addresses from XSK descriptors was pre-allocated during first fragment processing yielded too big performance regression for 64b traffic. In current approach impact is much reduced on my tests and for jumbo frames I observed traffic being slower by at most 9%. Magnus suggested to have this way of processing special cased for XDP_SHARED_UMEM, so we would identify this during bind and set different hooks for 'backpressure mechanism' on CQ and for skb destructor, but given that results looked promising on my side I decided to have a single data path for XSK generic Tx. I suppose other auxiliary stuff would have to land as well in order to make it work. Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path") Reported-by: Eryk Kubanski Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/ Acked-by: Stanislav Fomichev Signed-off-by: Maciej Fijalkowski Tested-by: Jason Xing Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20250904194907.2342177-1-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 113 ++++++++++++++++++++++++++++++++++++++------ net/xdp/xsk_queue.h | 12 +++++ 2 files changed, 111 insertions(+), 14 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9c3acecc14b1..72e34bd2d925 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,6 +36,20 @@ #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET 32 +struct xsk_addr_node { + u64 addr; + struct list_head addr_node; +}; + +struct xsk_addr_head { + u32 num_descs; + struct list_head addrs_list; +}; + +static struct kmem_cache *xsk_tx_generic_cache; + +#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb)) + void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) @@ -532,24 +546,43 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) { unsigned long flags; int ret; spin_lock_irqsave(&pool->cq_lock, flags); - ret = xskq_prod_reserve_addr(pool->cq, addr); + ret = xskq_prod_reserve(pool->cq); spin_unlock_irqrestore(&pool->cq_lock, flags); return ret; } -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) +static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, + struct sk_buff *skb) { + struct xsk_addr_node *pos, *tmp; + u32 descs_processed = 0; unsigned long flags; + u32 idx; spin_lock_irqsave(&pool->cq_lock, flags); - xskq_prod_submit_n(pool->cq, n); + idx = xskq_get_prod(pool->cq); + + xskq_prod_write_addr(pool->cq, idx, + (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg); + descs_processed++; + + if (unlikely(XSKCB(skb)->num_descs > 1)) { + list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { + xskq_prod_write_addr(pool->cq, idx + descs_processed, + pos->addr); + descs_processed++; + list_del(&pos->addr_node); + kmem_cache_free(xsk_tx_generic_cache, pos); + } + } + xskq_prod_submit_n(pool->cq, descs_processed); spin_unlock_irqrestore(&pool->cq_lock, flags); } @@ -562,9 +595,14 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) spin_unlock_irqrestore(&pool->cq_lock, flags); } +static void xsk_inc_num_desc(struct sk_buff *skb) +{ + XSKCB(skb)->num_descs++; +} + static u32 xsk_get_num_desc(struct sk_buff *skb) { - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; + return XSKCB(skb)->num_descs; } static void xsk_destruct_skb(struct sk_buff *skb) @@ -576,23 +614,33 @@ static void xsk_destruct_skb(struct sk_buff *skb) *compl->tx_timestamp = ktime_get_tai_fast_ns(); } - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); + xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); sock_wfree(skb); } -static void xsk_set_destructor_arg(struct sk_buff *skb) +static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr) { - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; - - skb_shinfo(skb)->destructor_arg = (void *)num; + BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb)); + INIT_LIST_HEAD(&XSKCB(skb)->addrs_list); + XSKCB(skb)->num_descs = 0; + skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr; } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); + u32 num_descs = xsk_get_num_desc(skb); + struct xsk_addr_node *pos, *tmp; + + if (unlikely(num_descs > 1)) { + list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { + list_del(&pos->addr_node); + kmem_cache_free(xsk_tx_generic_cache, pos); + } + } skb->destructor = sock_wfree; - xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); + xsk_cq_cancel_locked(xs->pool, num_descs); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; @@ -609,6 +657,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; + struct xsk_addr_node *xsk_addr; struct sk_buff *skb = xs->skb; struct page *page; void *buffer; @@ -623,6 +672,19 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, return ERR_PTR(err); skb_reserve(skb, hr); + + xsk_set_destructor_arg(skb, desc->addr); + } else { + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (!xsk_addr) + return ERR_PTR(-ENOMEM); + + /* in case of -EOVERFLOW that could happen below, + * xsk_consume_skb() will release this node as whole skb + * would be dropped, which implies freeing all list elements + */ + xsk_addr->addr = desc->addr; + list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } addr = desc->addr; @@ -694,8 +756,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) goto free_err; + + xsk_set_destructor_arg(skb, desc->addr); } else { int nr_frags = skb_shinfo(skb)->nr_frags; + struct xsk_addr_node *xsk_addr; struct page *page; u8 *vaddr; @@ -710,12 +775,22 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, goto free_err; } + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (!xsk_addr) { + __free_page(page); + err = -ENOMEM; + goto free_err; + } + vaddr = kmap_local_page(page); memcpy(vaddr, buffer, len); kunmap_local(vaddr); skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); + + xsk_addr->addr = desc->addr; + list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } if (first_frag && desc->options & XDP_TX_METADATA) { @@ -759,7 +834,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); - xsk_set_destructor_arg(skb); + xsk_inc_num_desc(skb); return skb; @@ -769,7 +844,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, if (err == -EOVERFLOW) { /* Drop the packet */ - xsk_set_destructor_arg(xs->skb); + xsk_inc_num_desc(xs->skb); xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } else { @@ -812,7 +887,7 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr); + err = xsk_cq_reserve_locked(xs->pool); if (err) { err = -EAGAIN; goto out; @@ -1815,8 +1890,18 @@ static int __init xsk_init(void) if (err) goto out_pernet; + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", + sizeof(struct xsk_addr_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!xsk_tx_generic_cache) { + err = -ENOMEM; + goto out_unreg_notif; + } + return 0; +out_unreg_notif: + unregister_netdevice_notifier(&xsk_netdev_notifier); out_pernet: unregister_pernet_subsys(&xsk_net_ops); out_sk: diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 46d87e961ad6..f16f390370dc 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q) /* Functions for producers */ +static inline u32 xskq_get_prod(struct xsk_queue *q) +{ + return READ_ONCE(q->ring->producer); +} + static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); @@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + ring->desc[idx & q->ring_mask] = addr; +} + static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, u32 nb_entries) { From 0d80e7f951be1bdd08d328fd87694be0d6e8aaa8 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 9 Sep 2025 18:49:59 +0000 Subject: [PATCH 217/450] rqspinlock: Choose trylock fallback for NMI waiters Currently, out of all 3 types of waiters in the rqspinlock slow path (i.e., pending bit waiter, wait queue head waiter, and wait queue non-head waiter), only the pending bit waiter and wait queue head waiters apply deadlock checks and a timeout on their waiting loop. The assumption here was that the wait queue head's forward progress would be sufficient to identify cases where the lock owner or pending bit waiter is stuck, and non-head waiters relying on the head waiter would prove to be sufficient for their own forward progress. However, the head waiter itself can be preempted by a non-head waiter for the same lock (AA) or a different lock (ABBA) in a manner that impedes its forward progress. In such a case, non-head waiters not performing deadlock and timeout checks becomes insufficient, and the system can enter a state of lockup. This is typically not a concern with non-NMI lock acquisitions, as lock holders which in run in different contexts (IRQ, non-IRQ) use "irqsave" variants of the lock APIs, which naturally excludes such lock holders from preempting one another on the same CPU. It might seem likely that a similar case may occur for rqspinlock when programs are attached to contention tracepoints (begin, end), however, these tracepoints either precede the enqueue into the wait queue, or succeed it, therefore cannot be used to preempt a head waiter's waiting loop. We must still be careful against nested kprobe and fentry programs that may attach to the middle of the head's waiting loop to stall forward progress and invoke another rqspinlock acquisition that proceeds as a non-head waiter. To this end, drop CC_FLAGS_FTRACE from the rqspinlock.o object file. For now, this issue is resolved by falling back to a repeated trylock on the lock word from NMI context, while performing the deadlock checks to break out early in case forward progress is impossible, and use the timeout as a final fallback. A more involved fix to terminate the queue when such a condition occurs will be made as a follow up. A selftest to stress this aspect of nested NMI/non-NMI locking attempts will be added in a subsequent patch to the bpf-next tree when this fix lands and trees are synchronized. Reported-by: Josef Bacik Fixes: 164c246571e9 ("rqspinlock: Protect waiters in queue from stalls") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250909184959.3509085-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 1 + kernel/bpf/rqspinlock.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 269c04a24664..f6cf8c2af5f7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -62,3 +62,4 @@ CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 5ab354d55d82..a00561b1d3e5 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -471,7 +471,7 @@ int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) * any MCS node. This is not the most elegant solution, but is * simple enough. */ - if (unlikely(idx >= _Q_MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) { lockevent_inc(lock_no_node); RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); while (!queued_spin_trylock(lock)) { From df0cb5cb50bd54d3cd4d0d83417ceec6a66404aa Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 9 Sep 2025 22:46:14 +0800 Subject: [PATCH 218/450] bpf: Allow fall back to interpreter for programs with stack size <= 512 OpenWRT users reported regression on ARMv6 devices after updating to latest HEAD, where tcpdump filter: tcpdump "not ether host 3c37121a2b3c and not ether host 184ecbca2a3a \ and not ether host 14130b4d3f47 and not ether host f0f61cf440b7 \ and not ether host a84b4dedf471 and not ether host d022be17e1d7 \ and not ether host 5c497967208b and not ether host 706655784d5b" fails with warning: "Kernel filter failed: No error information" when using config: # CONFIG_BPF_JIT_ALWAYS_ON is not set CONFIG_BPF_JIT_DEFAULT_ON=y The issue arises because commits: 1. "bpf: Fix array bounds error with may_goto" changed default runtime to __bpf_prog_ret0_warn when jit_requested = 1 2. "bpf: Avoid __bpf_prog_ret0_warn when jit fails" returns error when jit_requested = 1 but jit fails This change restores interpreter fallback capability for BPF programs with stack size <= 512 bytes when jit fails. Reported-by: Felix Fietkau Closes: https://lore.kernel.org/bpf/2e267b4b-0540-45d8-9310-e127bf95fc63@nbd.name/ Fixes: 6ebc5030e0c5 ("bpf: Fix array bounds error with may_goto") Signed-off-by: KaFai Wan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250909144614.2991253-1-kafai.wan@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f8ac77d08ca7..e4568d44e827 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2366,8 +2366,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, or interpreter is being used when - * prog->jit_requested is not 0, so warn about it! + * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; @@ -2468,8 +2467,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return ret; } -static void bpf_prog_select_func(struct bpf_prog *fp) +static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { + bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; @@ -2478,15 +2478,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ - if (!fp->jit_requested && - !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; + select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif + return select_interpreter; } /** @@ -2505,7 +2506,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ - bool jit_needed = fp->jit_requested; + bool jit_needed = false; if (fp->bpf_func) goto finalize; @@ -2514,7 +2515,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) bpf_prog_has_kfunc_call(fp)) jit_needed = true; - bpf_prog_select_func(fp); + if (!bpf_prog_select_interpreter(fp)) + jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during From 6d78b4473cdb08b74662355a9e8510bde09c511e Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 9 Sep 2025 09:52:20 +0000 Subject: [PATCH 219/450] bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init() Currently, calling bpf_map_kmalloc_node() from __bpf_async_init() can cause various locking issues; see the following stack trace (edited for style) as one example: ... [10.011566] do_raw_spin_lock.cold [10.011570] try_to_wake_up (5) double-acquiring the same [10.011575] kick_pool rq_lock, causing a hardlockup [10.011579] __queue_work [10.011582] queue_work_on [10.011585] kernfs_notify [10.011589] cgroup_file_notify [10.011593] try_charge_memcg (4) memcg accounting raises an [10.011597] obj_cgroup_charge_pages MEMCG_MAX event [10.011599] obj_cgroup_charge_account [10.011600] __memcg_slab_post_alloc_hook [10.011603] __kmalloc_node_noprof ... [10.011611] bpf_map_kmalloc_node [10.011612] __bpf_async_init [10.011615] bpf_timer_init (3) BPF calls bpf_timer_init() [10.011617] bpf_prog_xxxxxxxxxxxxxxxx_fcg_runnable [10.011619] bpf__sched_ext_ops_runnable [10.011620] enqueue_task_scx (2) BPF runs with rq_lock held [10.011622] enqueue_task [10.011626] ttwu_do_activate [10.011629] sched_ttwu_pending (1) grabs rq_lock ... The above was reproduced on bpf-next (b338cf849ec8) by modifying ./tools/sched_ext/scx_flatcg.bpf.c to call bpf_timer_init() during ops.runnable(), and hacking the memcg accounting code a bit to make a bpf_timer_init() call more likely to raise an MEMCG_MAX event. We have also run into other similar variants (both internally and on bpf-next), including double-acquiring cgroup_file_kn_lock, the same worker_pool::lock, etc. As suggested by Shakeel, fix this by using __GFP_HIGH instead of GFP_ATOMIC in __bpf_async_init(), so that e.g. if try_charge_memcg() raises an MEMCG_MAX event, we call __memcg_memory_event() with @allow_spinning=false and avoid calling cgroup_file_notify() there. Depends on mm patch "memcg: skip cgroup_file_notify if spinning is not allowed": https://lore.kernel.org/bpf/20250905201606.66198-1-shakeel.butt@linux.dev/ v0 approach s/bpf_map_kmalloc_node/bpf_mem_alloc/ https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/ v1 approach: https://lore.kernel.org/bpf/20250905234547.862249-1-yepeilin@google.com/ Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Suggested-by: Shakeel Butt Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/20250909095222.2121438-1-yepeilin@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b9b0c5fe33f6..8af62cb243d9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1274,8 +1274,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* allocate hrtimer via map_kmalloc to use memcg accounting */ - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until + * kmalloc_nolock() is available, avoid locking issues by using + * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). + */ + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; From 90f7c100d2dd99d5cd5be950d553edd2647e6cc8 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sat, 6 Sep 2025 21:19:29 -0300 Subject: [PATCH 220/450] smb: client: fix compound alignment with encryption The encryption layer can't handle the padding iovs, so flatten the compound request into a single buffer with required padding to prevent the server from dropping the connection when finding unaligned compound requests. Fixes: bc925c1216f0 ("smb: client: improve compound padding in encryption") Signed-off-by: Paulo Alcantara (Red Hat) Reviewed-by: David Howells Cc: linux-cifs@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 94b1d7a395d5..9f9955d274cf 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2640,13 +2640,35 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) } /* SMB headers in a compound are 8 byte aligned. */ - if (!IS_ALIGNED(len, 8)) { - num_padding = 8 - (len & 7); + if (IS_ALIGNED(len, 8)) + goto out; + + num_padding = 8 - (len & 7); + if (smb3_encryption_required(tcon)) { + int i; + + /* + * Flatten request into a single buffer with required padding as + * the encryption layer can't handle the padding iovs. + */ + for (i = 1; i < rqst->rq_nvec; i++) { + memcpy(rqst->rq_iov[0].iov_base + + rqst->rq_iov[0].iov_len, + rqst->rq_iov[i].iov_base, + rqst->rq_iov[i].iov_len); + rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len; + } + memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len, + 0, num_padding); + rqst->rq_iov[0].iov_len += num_padding; + rqst->rq_nvec = 1; + } else { rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding; rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding; rqst->rq_nvec++; - len += num_padding; } + len += num_padding; +out: shdr->NextCommand = cpu_to_le32(len); } From e0d1c55501d377163eb57feed863777ed1c973ad Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sun, 7 Sep 2025 21:44:01 +0100 Subject: [PATCH 221/450] net: phy: fix phy_uses_state_machine() The blamed commit changed the conditions which phylib uses to stop and start the state machine in the suspend and resume paths, and while improving it, has caused two issues. The original code used this test: phydev->attached_dev && phydev->adjust_link and if true, the paths would handle the PHY state machine. This test evaluates true for normal drivers that are using phylib directly while the PHY is attached to the network device, but false in all other cases, which include the following cases: - when the PHY has never been attached to a network device. - when the PHY has been detached from a network device (as phy_detach() sets phydev->attached_dev to NULL, phy_disconnect() calls phy_detach() and additionally sets phydev->adjust_link NULL.) - when phylink is using the driver (as phydev->adjust_link is NULL.) Only the third case was incorrect, and the blamed commit attempted to fix this by changing this test to (simplified for brevity, see phy_uses_state_machine()): phydev->phy_link_change == phy_link_change ? phydev->attached_dev && phydev->adjust_link : true However, this also incorrectly evaluates true in the first two cases. Fix the first case by ensuring that phy_uses_state_machine() returns false when phydev->phy_link_change is NULL. Fix the second case by ensuring that phydev->phy_link_change is set to NULL when phy_detach() is called. Reported-by: Xu Yang Link: https://lore.kernel.org/r/20250806082931.3289134-1-xu.yang_2@nxp.com Fixes: fc75ea20ffb4 ("net: phy: allow MDIO bus PM ops to start/stop state machine for phylink-controlled PHY") Signed-off-by: Russell King (Oracle) Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1uvMEz-00000003Aoe-3qWe@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 7556aa3dd7ee..c82c1997147b 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -287,8 +287,7 @@ static bool phy_uses_state_machine(struct phy_device *phydev) if (phydev->phy_link_change == phy_link_change) return phydev->attached_dev && phydev->adjust_link; - /* phydev->phy_link_change is implicitly phylink_phy_change() */ - return true; + return !!phydev->phy_link_change; } static bool mdio_bus_phy_may_suspend(struct phy_device *phydev) @@ -1864,6 +1863,8 @@ void phy_detach(struct phy_device *phydev) phydev->attached_dev = NULL; phy_link_topo_del_phy(dev, phydev); } + + phydev->phy_link_change = NULL; phydev->phylink = NULL; if (!phydev->is_on_sfp_module) From c5ea3065586d790ea5193a679b85585173d59866 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 7 Sep 2025 21:24:06 -0300 Subject: [PATCH 222/450] smb: client: fix data loss due to broken rename(2) Rename of open files in SMB2+ has been broken for a very long time, resulting in data loss as the CIFS client would fail the rename(2) call with -ENOENT and then removing the target file. Fix this by implementing ->rename_pending_delete() for SMB2+, which will rename busy files to random filenames (e.g. silly rename) during unlink(2) or rename(2), and then marking them to delete-on-close. Besides, introduce a FIND_WR_NO_PENDING_DELETE flag to prevent open(2) from reusing open handles that had been marked as delete pending. Handle it in cifs_get_readable_path() as well. Reported-by: Jean-Baptiste Denis Closes: https://marc.info/?i=16aeb380-30d4-4551-9134-4e7d1dc833c0@pasteur.fr Reviewed-by: David Howells Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Cc: Frank Sorenson Cc: Olga Kornievskaia Cc: Benjamin Coddington Cc: Scott Mayhew Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 13 ++- fs/smb/client/file.c | 18 +++- fs/smb/client/inode.c | 86 ++++++++++++---- fs/smb/client/smb2glob.h | 3 +- fs/smb/client/smb2inode.c | 204 ++++++++++++++++++++++++++++++-------- fs/smb/client/smb2ops.c | 4 + fs/smb/client/smb2proto.h | 3 + fs/smb/client/trace.h | 9 +- 8 files changed, 268 insertions(+), 72 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 1e64a4fb6af0..0fae95cf81c4 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -87,7 +87,7 @@ #define SMB_INTERFACE_POLL_INTERVAL 600 /* maximum number of PDUs in one compound */ -#define MAX_COMPOUND 7 +#define MAX_COMPOUND 10 /* * Default number of credits to keep available for SMB3. @@ -1882,9 +1882,12 @@ static inline bool is_replayable_error(int error) /* cifs_get_writable_file() flags */ -#define FIND_WR_ANY 0 -#define FIND_WR_FSUID_ONLY 1 -#define FIND_WR_WITH_DELETE 2 +enum cifs_writable_file_flags { + FIND_WR_ANY = 0U, + FIND_WR_FSUID_ONLY = (1U << 0), + FIND_WR_WITH_DELETE = (1U << 1), + FIND_WR_NO_PENDING_DELETE = (1U << 2), +}; #define MID_FREE 0 #define MID_REQUEST_ALLOCATED 1 @@ -2343,6 +2346,8 @@ struct smb2_compound_vars { struct kvec qi_iov; struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec unlink_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec rename_iov[SMB2_SET_INFO_IOV_SIZE]; struct kvec close_iov; struct smb2_file_rename_info_hdr rename_info; struct smb2_file_link_info_hdr link_info; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 186e061068be..cb907e18cc35 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -998,7 +998,10 @@ int cifs_open(struct inode *inode, struct file *file) /* Get the cached handle as SMB2 close is deferred */ if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { - rc = cifs_get_writable_path(tcon, full_path, FIND_WR_FSUID_ONLY, &cfile); + rc = cifs_get_writable_path(tcon, full_path, + FIND_WR_FSUID_ONLY | + FIND_WR_NO_PENDING_DELETE, + &cfile); } else { rc = cifs_get_readable_path(tcon, full_path, &cfile); } @@ -2530,6 +2533,9 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, continue; if (with_delete && !(open_file->fid.access & DELETE)) continue; + if ((flags & FIND_WR_NO_PENDING_DELETE) && + open_file->status_file_deleted) + continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ @@ -2647,6 +2653,16 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, spin_unlock(&tcon->open_file_lock); free_dentry_path(page); *ret_file = find_readable_file(cinode, 0); + if (*ret_file) { + spin_lock(&cinode->open_file_lock); + if ((*ret_file)->status_file_deleted) { + spin_unlock(&cinode->open_file_lock); + cifsFileInfo_put(*ret_file); + *ret_file = NULL; + } else { + spin_unlock(&cinode->open_file_lock); + } + } return *ret_file ? 0 : -ENOENT; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index fe453a4b3dc8..11d442e8b3d6 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1931,7 +1931,7 @@ cifs_drop_nlink(struct inode *inode) * but will return the EACCES to the caller. Note that the VFS does not call * unlink on negative dentries currently. */ -int cifs_unlink(struct inode *dir, struct dentry *dentry) +static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyrename) { int rc = 0; unsigned int xid; @@ -2003,7 +2003,11 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto psx_del_no_retry; } - rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); + if (sillyrename || (server->vals->protocol_id > SMB10_PROT_ID && + d_is_positive(dentry) && d_count(dentry) > 2)) + rc = -EBUSY; + else + rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); psx_del_no_retry: if (!rc) { @@ -2071,6 +2075,11 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) return rc; } +int cifs_unlink(struct inode *dir, struct dentry *dentry) +{ + return __cifs_unlink(dir, dentry, false); +} + static int cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, const char *full_path, struct cifs_sb_info *cifs_sb, @@ -2358,14 +2367,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb); cifs_put_tlink(tlink); + cifsInode = CIFS_I(d_inode(direntry)); + if (!rc) { + set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags); spin_lock(&d_inode(direntry)->i_lock); i_size_write(d_inode(direntry), 0); clear_nlink(d_inode(direntry)); spin_unlock(&d_inode(direntry)->i_lock); } - cifsInode = CIFS_I(d_inode(direntry)); /* force revalidate to go get info when needed */ cifsInode->time = 0; @@ -2458,8 +2469,11 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ do_rename_exit: - if (rc == 0) + if (rc == 0) { d_move(from_dentry, to_dentry); + /* Force a new lookup */ + d_drop(from_dentry); + } cifs_put_tlink(tlink); return rc; } @@ -2470,6 +2484,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, struct dentry *target_dentry, unsigned int flags) { const char *from_name, *to_name; + struct TCP_Server_Info *server; void *page1, *page2; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; @@ -2505,6 +2520,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + server = tcon->ses->server; page1 = alloc_dentry_path(); page2 = alloc_dentry_path(); @@ -2591,19 +2607,53 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, unlink_target: #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ - - /* Try unlinking the target dentry if it's not negative */ - if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) { - if (d_is_dir(target_dentry)) - tmprc = cifs_rmdir(target_dir, target_dentry); - else - tmprc = cifs_unlink(target_dir, target_dentry); - if (tmprc) - goto cifs_rename_exit; - rc = cifs_do_rename(xid, source_dentry, from_name, - target_dentry, to_name); - if (!rc) - rehash = false; + if (d_really_is_positive(target_dentry)) { + if (!rc) { + struct inode *inode = d_inode(target_dentry); + /* + * Samba and ksmbd servers allow renaming a target + * directory that is open, so make sure to update + * ->i_nlink and then mark it as delete pending. + */ + if (S_ISDIR(inode->i_mode)) { + drop_cached_dir_by_name(xid, tcon, to_name, cifs_sb); + spin_lock(&inode->i_lock); + i_size_write(inode, 0); + clear_nlink(inode); + spin_unlock(&inode->i_lock); + set_bit(CIFS_INO_DELETE_PENDING, &CIFS_I(inode)->flags); + CIFS_I(inode)->time = 0; /* force reval */ + inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + } + } else if (rc == -EACCES || rc == -EEXIST) { + /* + * Rename failed, possibly due to a busy target. + * Retry it by unliking the target first. + */ + if (d_is_dir(target_dentry)) { + tmprc = cifs_rmdir(target_dir, target_dentry); + } else { + tmprc = __cifs_unlink(target_dir, target_dentry, + server->vals->protocol_id > SMB10_PROT_ID); + } + if (tmprc) { + /* + * Some servers will return STATUS_ACCESS_DENIED + * or STATUS_DIRECTORY_NOT_EMPTY when failing to + * rename a non-empty directory. Make sure to + * propagate the appropriate error back to + * userspace. + */ + if (tmprc == -EEXIST || tmprc == -ENOTEMPTY) + rc = tmprc; + goto cifs_rename_exit; + } + rc = cifs_do_rename(xid, source_dentry, from_name, + target_dentry, to_name); + if (!rc) + rehash = false; + } } /* force revalidate to go get info when needed */ @@ -2629,6 +2679,8 @@ cifs_dentry_needs_reval(struct dentry *dentry) struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct cached_fid *cfid = NULL; + if (test_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags)) + return false; if (cifs_i->time == 0) return true; diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h index 224495322a05..e56e4d402f13 100644 --- a/fs/smb/client/smb2glob.h +++ b/fs/smb/client/smb2glob.h @@ -30,10 +30,9 @@ enum smb2_compound_ops { SMB2_OP_QUERY_DIR, SMB2_OP_MKDIR, SMB2_OP_RENAME, - SMB2_OP_DELETE, SMB2_OP_HARDLINK, SMB2_OP_SET_EOF, - SMB2_OP_RMDIR, + SMB2_OP_UNLINK, SMB2_OP_POSIX_QUERY_INFO, SMB2_OP_SET_REPARSE, SMB2_OP_GET_REPARSE, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 31c13fb5b85b..7cadc8ca4f55 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -346,9 +346,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_posix_query_info_compound_enter(xid, tcon->tid, ses->Suid, full_path); break; - case SMB2_OP_DELETE: - trace_smb3_delete_enter(xid, tcon->tid, ses->Suid, full_path); - break; case SMB2_OP_MKDIR: /* * Directories are created through parameters in the @@ -356,23 +353,40 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, */ trace_smb3_mkdir_enter(xid, tcon->tid, ses->Suid, full_path); break; - case SMB2_OP_RMDIR: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; + case SMB2_OP_UNLINK: + rqst[num_rqst].rq_iov = vars->unlink_iov; rqst[num_rqst].rq_nvec = 1; size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ data[0] = &delete_pending[0]; - rc = SMB2_set_info_init(tcon, server, - &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID, current->tgid, - FILE_DISPOSITION_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); - if (rc) + if (cfile) { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } else { + rc = SMB2_set_info_init(tcon, server, + &rqst[num_rqst], + COMPOUND_FID, + COMPOUND_FID, + current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, + data, size); + } + if (!rc && (!cfile || num_rqst > 1)) { + smb2_set_next_command(tcon, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst]); + } else if (rc) { goto finished; - smb2_set_next_command(tcon, &rqst[num_rqst]); - smb2_set_related(&rqst[num_rqst++]); - trace_smb3_rmdir_enter(xid, tcon->tid, ses->Suid, full_path); + } + num_rqst++; + trace_smb3_unlink_enter(xid, tcon->tid, ses->Suid, full_path); break; case SMB2_OP_SET_EOF: rqst[num_rqst].rq_iov = &vars->si_iov[0]; @@ -442,7 +456,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, ses->Suid, full_path); break; case SMB2_OP_RENAME: - rqst[num_rqst].rq_iov = &vars->si_iov[0]; + rqst[num_rqst].rq_iov = vars->rename_iov; rqst[num_rqst].rq_nvec = 2; len = in_iov[i].iov_len; @@ -732,19 +746,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_posix_query_info_compound_done(xid, tcon->tid, ses->Suid); break; - case SMB2_OP_DELETE: - if (rc) - trace_smb3_delete_err(xid, tcon->tid, ses->Suid, rc); - else { - /* - * If dentry (hence, inode) is NULL, lease break is going to - * take care of degrading leases on handles for deleted files. - */ - if (inode) - cifs_mark_open_handles_for_deleted_file(inode, full_path); - trace_smb3_delete_done(xid, tcon->tid, ses->Suid); - } - break; case SMB2_OP_MKDIR: if (rc) trace_smb3_mkdir_err(xid, tcon->tid, ses->Suid, rc); @@ -765,11 +766,11 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_rename_done(xid, tcon->tid, ses->Suid); SMB2_set_info_free(&rqst[num_rqst++]); break; - case SMB2_OP_RMDIR: - if (rc) - trace_smb3_rmdir_err(xid, tcon->tid, ses->Suid, rc); + case SMB2_OP_UNLINK: + if (!rc) + trace_smb3_unlink_done(xid, tcon->tid, ses->Suid); else - trace_smb3_rmdir_done(xid, tcon->tid, ses->Suid); + trace_smb3_unlink_err(xid, tcon->tid, ses->Suid, rc); SMB2_set_info_free(&rqst[num_rqst++]); break; case SMB2_OP_SET_EOF: @@ -1166,7 +1167,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE); return smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, NULL, - &(int){SMB2_OP_RMDIR}, 1, + &(int){SMB2_OP_UNLINK}, 1, NULL, NULL, NULL, NULL); } @@ -1175,20 +1176,29 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb, struct dentry *dentry) { struct cifs_open_parms oparms; + struct inode *inode = NULL; + int rc; - oparms = CIFS_OPARMS(cifs_sb, tcon, name, - DELETE, FILE_OPEN, - CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - ACL_NO_MODE); - int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, dentry); + if (dentry) + inode = d_inode(dentry); + + oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE, + FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE); + rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, + NULL, &(int){SMB2_OP_UNLINK}, + 1, NULL, NULL, NULL, dentry); if (rc == -EINVAL) { cifs_dbg(FYI, "invalid lease key, resending request without lease"); rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms, - NULL, &(int){SMB2_OP_DELETE}, 1, - NULL, NULL, NULL, NULL); + NULL, &(int){SMB2_OP_UNLINK}, + 1, NULL, NULL, NULL, NULL); } + /* + * If dentry (hence, inode) is NULL, lease break is going to + * take care of degrading leases on handles for deleted files. + */ + if (!rc && inode) + cifs_mark_open_handles_for_deleted_file(inode, name); return rc; } @@ -1441,3 +1451,113 @@ int smb2_query_reparse_point(const unsigned int xid, cifs_free_open_info(&data); return rc; } + +static inline __le16 *utf16_smb2_path(struct cifs_sb_info *cifs_sb, + const char *name, size_t namelen) +{ + int len; + + if (*name == '\\' || + (cifs_sb_master_tlink(cifs_sb) && + cifs_sb_master_tcon(cifs_sb)->posix_extensions && *name == '/')) + name++; + return cifs_strndup_to_utf16(name, namelen, &len, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); +} + +int smb2_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(d_inode(dentry)->i_sb); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry)); + __le16 *utf16_path __free(kfree) = NULL; + __u32 co = file_create_options(dentry); + int cmds[] = { + SMB2_OP_SET_INFO, + SMB2_OP_RENAME, + SMB2_OP_UNLINK, + }; + const int num_cmds = ARRAY_SIZE(cmds); + char *to_name __free(kfree) = NULL; + __u32 attrs = cinode->cifsAttrs; + struct cifs_open_parms oparms; + static atomic_t sillycounter; + struct cifsFileInfo *cfile; + struct tcon_link *tlink; + struct cifs_tcon *tcon; + struct kvec iov[2]; + const char *ppath; + void *page; + size_t len; + int rc; + + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); + + page = alloc_dentry_path(); + + ppath = build_path_from_dentry(dentry->d_parent, page); + if (IS_ERR(ppath)) { + rc = PTR_ERR(ppath); + goto out; + } + + len = strlen(ppath) + strlen("/.__smb1234") + 1; + to_name = kmalloc(len, GFP_KERNEL); + if (!to_name) { + rc = -ENOMEM; + goto out; + } + + scnprintf(to_name, len, "%s%c.__smb%04X", ppath, CIFS_DIR_SEP(cifs_sb), + atomic_inc_return(&sillycounter) & 0xffff); + + utf16_path = utf16_smb2_path(cifs_sb, to_name, len); + if (!utf16_path) { + rc = -ENOMEM; + goto out; + } + + drop_cached_dir_by_name(xid, tcon, full_path, cifs_sb); + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, + DELETE | FILE_WRITE_ATTRIBUTES, + FILE_OPEN, co, ACL_NO_MODE); + + attrs &= ~ATTR_READONLY; + if (!attrs) + attrs = ATTR_NORMAL; + if (d_inode(dentry)->i_nlink <= 1) + attrs |= ATTR_HIDDEN; + iov[0].iov_base = &(FILE_BASIC_INFO) { + .Attributes = cpu_to_le32(attrs), + }; + iov[0].iov_len = sizeof(FILE_BASIC_INFO); + iov[1].iov_base = utf16_path; + iov[1].iov_len = sizeof(*utf16_path) * UniStrlen((wchar_t *)utf16_path); + + cifs_get_writable_path(tcon, full_path, FIND_WR_WITH_DELETE, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov, + cmds, num_cmds, cfile, NULL, NULL, dentry); + if (rc == -EINVAL) { + cifs_dbg(FYI, "invalid lease key, resending request without lease\n"); + cifs_get_writable_path(tcon, full_path, + FIND_WR_WITH_DELETE, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, iov, + cmds, num_cmds, cfile, NULL, NULL, NULL); + } + if (!rc) { + set_bit(CIFS_INO_DELETE_PENDING, &cinode->flags); + } else { + cifs_tcon_dbg(FYI, "%s: failed to rename '%s' to '%s': %d\n", + __func__, full_path, to_name, rc); + rc = -EIO; + } +out: + cifs_put_tlink(tlink); + free_dentry_path(page); + return rc; +} diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 9f9955d274cf..e586f3f4b5c9 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -5398,6 +5398,7 @@ struct smb_version_operations smb20_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ @@ -5503,6 +5504,7 @@ struct smb_version_operations smb21_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; struct smb_version_operations smb30_operations = { @@ -5619,6 +5621,7 @@ struct smb_version_operations smb30_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; struct smb_version_operations smb311_operations = { @@ -5735,6 +5738,7 @@ struct smb_version_operations smb311_operations = { .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, .is_network_name_deleted = smb2_is_network_name_deleted, + .rename_pending_delete = smb2_rename_pending_delete, }; #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 6e805ece6a7b..b3f1398c9f79 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -317,5 +317,8 @@ int posix_info_sid_size(const void *beg, const void *end); int smb2_make_nfs_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); +int smb2_rename_pending_delete(const char *full_path, + struct dentry *dentry, + const unsigned int xid); #endif /* _SMB2PROTO_H */ diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index fe0e075bc63c..fd650e2afc76 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -669,13 +669,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(posix_query_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); -DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(unlink_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_wsl_ea_compound_enter); -DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter); DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter); @@ -710,13 +709,12 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(posix_query_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); -DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(unlink_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done); -DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done); DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done); @@ -756,14 +754,13 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(posix_query_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); -DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(unlink_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); -DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err); DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err); From 686cab5a18e443e1d5f2abb17bed45837836425f Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 7 Sep 2025 11:08:21 +0300 Subject: [PATCH 223/450] net: dev_ioctl: take ops lock in hwtstamp lower paths ndo hwtstamp callbacks are expected to run under the per-device ops lock. Make the lower get/set paths consistent with the rest of ndo invocations. Kernel log: WARNING: CPU: 13 PID: 51364 at ./include/net/netdev_lock.h:70 __netdev_update_features+0x4bd/0xe60 ... RIP: 0010:__netdev_update_features+0x4bd/0xe60 ... Call Trace: netdev_update_features+0x1f/0x60 mlx5_hwtstamp_set+0x181/0x290 [mlx5_core] mlx5e_hwtstamp_set+0x19/0x30 [mlx5_core] dev_set_hwtstamp_phylib+0x9f/0x220 dev_set_hwtstamp_phylib+0x9f/0x220 dev_set_hwtstamp+0x13d/0x240 dev_ioctl+0x12f/0x4b0 sock_ioctl+0x171/0x370 __x64_sys_ioctl+0x3f7/0x900 ? __sys_setsockopt+0x69/0xb0 do_syscall_64+0x6f/0x2e0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 ... .... ---[ end trace 0000000000000000 ]--- Note that the mlx5_hwtstamp_set and mlx5e_hwtstamp_set functions shown in the trace come from an in progress patch converting the legacy ioctl to ndo_hwtstamp_get/set and are not present in mainline. Fixes: ffb7ed19ac0a ("net: hold netdev instance lock during ioctl operations") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Link: https://patch.msgid.link/20250907080821.2353388-1-cjubran@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/dev_ioctl.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9c0ad7f4b5d8..ad54b12d4b4c 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -464,8 +464,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_get) - return dev_get_hwtstamp_phylib(dev, kernel_cfg); + if (ops->ndo_hwtstamp_get) { + int err; + + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); @@ -481,8 +488,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_set) - return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + if (ops->ndo_hwtstamp_set) { + int err; + + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); From 0f82c3ba66c6b2e3cde0f255156a753b108ee9dc Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 8 Sep 2025 10:36:14 -0700 Subject: [PATCH 224/450] macsec: sync features on RTM_NEWLINK Syzkaller managed to lock the lower device via ETHTOOL_SFEATURES: netdev_lock include/linux/netdevice.h:2761 [inline] netdev_lock_ops include/net/netdev_lock.h:42 [inline] netdev_sync_lower_features net/core/dev.c:10649 [inline] __netdev_update_features+0xcb1/0x1be0 net/core/dev.c:10819 netdev_update_features+0x6d/0xe0 net/core/dev.c:10876 macsec_notify+0x2f5/0x660 drivers/net/macsec.c:4533 notifier_call_chain+0x1b3/0x3e0 kernel/notifier.c:85 call_netdevice_notifiers_extack net/core/dev.c:2267 [inline] call_netdevice_notifiers net/core/dev.c:2281 [inline] netdev_features_change+0x85/0xc0 net/core/dev.c:1570 __dev_ethtool net/ethtool/ioctl.c:3469 [inline] dev_ethtool+0x1536/0x19b0 net/ethtool/ioctl.c:3502 dev_ioctl+0x392/0x1150 net/core/dev_ioctl.c:759 It happens because lower features are out of sync with the upper: __dev_ethtool (real_dev) netdev_lock_ops(real_dev) ETHTOOL_SFEATURES __netdev_features_change netdev_sync_upper_features disable LRO on the lower if (old_features != dev->features) netdev_features_change fires NETDEV_FEAT_CHANGE macsec_notify NETDEV_FEAT_CHANGE netdev_update_features (for each macsec dev) netdev_sync_lower_features if (upper_features != lower_features) netdev_lock_ops(lower) # lower == real_dev stuck ... netdev_unlock_ops(real_dev) Per commit af5f54b0ef9e ("net: Lock lower level devices when updating features"), we elide the lock/unlock when the upper and lower features are synced. Makes sure the lower (real_dev) has proper features after the macsec link has been created. This makes sure we never hit the situation where we need to sync upper flags to the lower. Reported-by: syzbot+7e0f89fb6cae5d002de0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7e0f89fb6cae5d002de0 Fixes: 7e4d784f5810 ("net: hold netdev instance lock during rtnetlink operations") Signed-off-by: Stanislav Fomichev Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250908173614.3358264-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 01329fe7451a..0eca96eeed58 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4286,6 +4286,7 @@ static int macsec_newlink(struct net_device *dev, if (err < 0) goto del_dev; + netdev_update_features(dev); netif_stacked_transfer_operstate(real_dev, dev); linkwatch_fire_event(dev); From 648de37416b301f046f62f1b65715c7fa8ebaa67 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Mon, 8 Sep 2025 11:16:01 -0700 Subject: [PATCH 225/450] mptcp: sockopt: make sync_socket_options propagate SOCK_KEEPOPEN Users reported a scenario where MPTCP connections that were configured with SO_KEEPALIVE prior to connect would fail to enable their keepalives if MTPCP fell back to TCP mode. After investigating, this affects keepalives for any connection where sync_socket_options is called on a socket that is in the closed or listening state. Joins are handled properly. For connects, sync_socket_options is called when the socket is still in the closed state. The tcp_set_keepalive() function does not act on sockets that are closed or listening, hence keepalive is not immediately enabled. Since the SO_KEEPOPEN flag is absent, it is not enabled later in the connect sequence via tcp_finish_connect. Setting the keepalive via sockopt after connect does work, but would not address any subsequently created flows. Fortunately, the fix here is straight-forward: set SOCK_KEEPOPEN on the subflow when calling sync_socket_options. The fix was valdidated both by using tcpdump to observe keepalive packets not being sent before the fix, and being sent after the fix. It was also possible to observe via ss that the keepalive timer was not enabled on these sockets before the fix, but was enabled afterwards. Fixes: 1b3e7ede1365 ("mptcp: setsockopt: handle SO_KEEPALIVE and SO_PRIORITY") Cc: stable@vger.kernel.org Signed-off-by: Krister Johansen Reviewed-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/aL8dYfPZrwedCIh9@templeofstupid.com Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2c267aff95be..2abe6f1e9940 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1532,13 +1532,12 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; + bool keep_open; - if (ssk->sk_prot->keepalive) { - if (sock_flag(sk, SOCK_KEEPOPEN)) - ssk->sk_prot->keepalive(ssk, 1); - else - ssk->sk_prot->keepalive(ssk, 0); - } + keep_open = sock_flag(sk, SOCK_KEEPOPEN); + if (ssk->sk_prot->keepalive) + ssk->sk_prot->keepalive(ssk, keep_open); + sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open); ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; From 7094b84863e5832cb1cd9c4b9d648904775b6bd9 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 8 Sep 2025 23:27:27 +0200 Subject: [PATCH 226/450] netlink: specs: mptcp: fix if-idx attribute type This attribute is used as a signed number in the code in pm_netlink.c: nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) The specs should then reflect that. Note that other 'if-idx' attributes from the same .yaml file use a signed number as well. Fixes: bc8aeb2045e2 ("Documentation: netlink: add a YAML spec for mptcp") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250908-net-mptcp-misc-fixes-6-17-rc5-v1-1-5f2168a66079@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index 02f1ddcfbf1c..d15335684ec3 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -256,7 +256,7 @@ attribute-sets: type: u32 - name: if-idx - type: u32 + type: s32 - name: reset-reason type: u32 From 6f021e95d0828edc8ed104a294594c2f9569383a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 8 Sep 2025 23:27:28 +0200 Subject: [PATCH 227/450] doc: mptcp: net.mptcp.pm_type is deprecated The net.mptcp.pm_type sysctl knob has been deprecated in v6.15, net.mptcp.path_manager should be used instead. Adapt the section about path managers to suggest using the new sysctl knob instead of the deprecated one. Fixes: 595c26d122d1 ("mptcp: sysctl: set path manager by name") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250908-net-mptcp-misc-fixes-6-17-rc5-v1-2-5f2168a66079@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/mptcp.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/mptcp.rst b/Documentation/networking/mptcp.rst index 17f2bab61164..2e31038d6462 100644 --- a/Documentation/networking/mptcp.rst +++ b/Documentation/networking/mptcp.rst @@ -60,10 +60,10 @@ address announcements. Typically, it is the client side that initiates subflows, and the server side that announces additional addresses via the ``ADD_ADDR`` and ``REMOVE_ADDR`` options. -Path managers are controlled by the ``net.mptcp.pm_type`` sysctl knob -- see -mptcp-sysctl.rst. There are two types: the in-kernel one (type ``0``) where the -same rules are applied for all the connections (see: ``ip mptcp``) ; and the -userspace one (type ``1``), controlled by a userspace daemon (i.e. `mptcpd +Path managers are controlled by the ``net.mptcp.path_manager`` sysctl knob -- +see mptcp-sysctl.rst. There are two types: the in-kernel one (``kernel``) where +the same rules are applied for all the connections (see: ``ip mptcp``) ; and the +userspace one (``userspace``), controlled by a userspace daemon (i.e. `mptcpd `_) where different rules can be applied for each connection. The path managers can be controlled via a Netlink API; see netlink_spec/mptcp_pm.rst. From ef1bd93b3b924086088b7818d9e5d89ede944f1f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 8 Sep 2025 23:27:29 +0200 Subject: [PATCH 228/450] selftests: mptcp: shellcheck: support v0.11.0 This v0.11.0 version introduces SC2329: Warn when (non-escaping) functions are never invoked. Except that, similar to SC2317, ShellCheck is currently unable to figure out functions that are invoked via trap, or indirectly, when calling functions via variables. It is then needed to disable this new SC2329. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250908-net-mptcp-misc-fixes-6-17-rc5-v1-3-5f2168a66079@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/diag.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_sockopt.sh | 2 +- tools/testing/selftests/net/mptcp/pm_netlink.sh | 5 +++-- tools/testing/selftests/net/mptcp/simult_flows.sh | 2 +- tools/testing/selftests/net/mptcp/userspace_pm.sh | 2 +- 7 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 7a3cb4c09e45..d847ff1737c3 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -28,7 +28,7 @@ flush_pids() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGKILL &>/dev/null diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 5e3c56253274..c2ab9f7f0d21 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -134,7 +134,7 @@ ns4="" TEST_GROUP="" # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "$cin_disconnect" diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 82cae37d9c20..7fd555b123b9 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -8,7 +8,7 @@ # ShellCheck incorrectly believes that most of the code here is unreachable # because it's invoked by variable name, see how the "tests" array is used -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 . "$(dirname "${0}")/mptcp_lib.sh" diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh index 418a903c3a4d..f01989be6e9b 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.sh @@ -95,7 +95,7 @@ init() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { mptcp_lib_ns_exit "${ns1}" "${ns2}" "${ns_sbox}" diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index ac7ec6f94023..ec6a87588191 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -32,7 +32,7 @@ ns1="" err=$(mktemp) # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "${err}" @@ -70,8 +70,9 @@ format_endpoints() { mptcp_lib_pm_nl_format_endpoints "${@}" } +# This function is invoked indirectly +#shellcheck disable=SC2317,SC2329 get_endpoint() { - # shellcheck disable=SC2317 # invoked indirectly mptcp_lib_pm_nl_get_endpoint "${ns1}" "${@}" } diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 2329c2f8519b..1903e8e84a31 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -35,7 +35,7 @@ usage() { } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { rm -f "$cout" "$sout" diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 333064b0b5ac..970c329735ff 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -94,7 +94,7 @@ test_fail() } # This function is used in the cleanup trap -#shellcheck disable=SC2317 +#shellcheck disable=SC2317,SC2329 cleanup() { print_title "Cleanup" From 5cb782ff3c62c837e4984b6ae9f5d9a423cd5088 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Sun, 7 Sep 2025 12:40:16 -0700 Subject: [PATCH 229/450] scsi: ufs: mcq: Fix memory allocation checks for SQE and CQE Previous checks incorrectly tested the DMA addresses (dma_handle) for NULL. Since dma_alloc_coherent() returns the CPU (virtual) address, the NULL check should be performed on the *_base_addr pointer to correctly detect allocation failures. Update the checks to validate sqe_base_addr and cqe_base_addr instead of sqe_dma_addr and cqe_dma_addr. Fixes: 4682abfae2eb ("scsi: ufs: core: mcq: Allocate memory for MCQ mode") Signed-off-by: Alok Tiwari Reviewed-by: Alim Akhtar Reviewed-by: Manivannan Sadhasivam Reviewed-by: Peter Wang Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs-mcq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 1e50675772fe..cc88aaa106da 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -243,7 +243,7 @@ int ufshcd_mcq_memory_alloc(struct ufs_hba *hba) hwq->sqe_base_addr = dmam_alloc_coherent(hba->dev, utrdl_size, &hwq->sqe_dma_addr, GFP_KERNEL); - if (!hwq->sqe_dma_addr) { + if (!hwq->sqe_base_addr) { dev_err(hba->dev, "SQE allocation failed\n"); return -ENOMEM; } @@ -252,7 +252,7 @@ int ufshcd_mcq_memory_alloc(struct ufs_hba *hba) hwq->cqe_base_addr = dmam_alloc_coherent(hba->dev, cqe_size, &hwq->cqe_dma_addr, GFP_KERNEL); - if (!hwq->cqe_dma_addr) { + if (!hwq->cqe_base_addr) { dev_err(hba->dev, "CQE allocation failed\n"); return -ENOMEM; } From 181993bb0d626cf88cc803f4356ce5c5abe86278 Mon Sep 17 00:00:00 2001 From: Yuezhang Mo Date: Wed, 10 Sep 2025 13:33:40 +0800 Subject: [PATCH 230/450] erofs: fix runtime warning on truncate_folio_batch_exceptionals() Commit 0e2f80afcfa6("fs/dax: ensure all pages are idle prior to filesystem unmount") introduced the WARN_ON_ONCE to capture whether the filesystem has removed all DAX entries or not and applied the fix to xfs and ext4. Apply the missed fix on erofs to fix the runtime warning: [ 5.266254] ------------[ cut here ]------------ [ 5.266274] WARNING: CPU: 6 PID: 3109 at mm/truncate.c:89 truncate_folio_batch_exceptionals+0xff/0x260 [ 5.266294] Modules linked in: [ 5.266999] CPU: 6 UID: 0 PID: 3109 Comm: umount Tainted: G S 6.16.0+ #6 PREEMPT(voluntary) [ 5.267012] Tainted: [S]=CPU_OUT_OF_SPEC [ 5.267017] Hardware name: Dell Inc. OptiPlex 5000/05WXFV, BIOS 1.5.1 08/24/2022 [ 5.267024] RIP: 0010:truncate_folio_batch_exceptionals+0xff/0x260 [ 5.267076] Code: 00 00 41 39 df 7f 11 eb 78 83 c3 01 49 83 c4 08 41 39 df 74 6c 48 63 f3 48 83 fe 1f 0f 83 3c 01 00 00 43 f6 44 26 08 01 74 df <0f> 0b 4a 8b 34 22 4c 89 ef 48 89 55 90 e8 ff 54 1f 00 48 8b 55 90 [ 5.267083] RSP: 0018:ffffc900013f36c8 EFLAGS: 00010202 [ 5.267095] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 5.267101] RDX: ffffc900013f3790 RSI: 0000000000000000 RDI: ffff8882a1407898 [ 5.267108] RBP: ffffc900013f3740 R08: 0000000000000000 R09: 0000000000000000 [ 5.267113] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 5.267119] R13: ffff8882a1407ab8 R14: ffffc900013f3888 R15: 0000000000000001 [ 5.267125] FS: 00007aaa8b437800(0000) GS:ffff88850025b000(0000) knlGS:0000000000000000 [ 5.267132] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 5.267138] CR2: 00007aaa8b3aac10 CR3: 000000024f764000 CR4: 0000000000f52ef0 [ 5.267144] PKRU: 55555554 [ 5.267150] Call Trace: [ 5.267154] [ 5.267181] truncate_inode_pages_range+0x118/0x5e0 [ 5.267193] ? save_trace+0x54/0x390 [ 5.267296] truncate_inode_pages_final+0x43/0x60 [ 5.267309] evict+0x2a4/0x2c0 [ 5.267339] dispose_list+0x39/0x80 [ 5.267352] evict_inodes+0x150/0x1b0 [ 5.267376] generic_shutdown_super+0x41/0x180 [ 5.267390] kill_block_super+0x1b/0x50 [ 5.267402] erofs_kill_sb+0x81/0x90 [erofs] [ 5.267436] deactivate_locked_super+0x32/0xb0 [ 5.267450] deactivate_super+0x46/0x60 [ 5.267460] cleanup_mnt+0xc3/0x170 [ 5.267475] __cleanup_mnt+0x12/0x20 [ 5.267485] task_work_run+0x5d/0xb0 [ 5.267499] exit_to_user_mode_loop+0x144/0x170 [ 5.267512] do_syscall_64+0x2b9/0x7c0 [ 5.267523] ? __lock_acquire+0x665/0x2ce0 [ 5.267535] ? __lock_acquire+0x665/0x2ce0 [ 5.267560] ? lock_acquire+0xcd/0x300 [ 5.267573] ? find_held_lock+0x31/0x90 [ 5.267582] ? mntput_no_expire+0x97/0x4e0 [ 5.267606] ? mntput_no_expire+0xa1/0x4e0 [ 5.267625] ? mntput+0x24/0x50 [ 5.267634] ? path_put+0x1e/0x30 [ 5.267647] ? do_faccessat+0x120/0x2f0 [ 5.267677] ? do_syscall_64+0x1a2/0x7c0 [ 5.267686] ? from_kgid_munged+0x17/0x30 [ 5.267703] ? from_kuid_munged+0x13/0x30 [ 5.267711] ? __do_sys_getuid+0x3d/0x50 [ 5.267724] ? do_syscall_64+0x1a2/0x7c0 [ 5.267732] ? irqentry_exit+0x77/0xb0 [ 5.267743] ? clear_bhb_loop+0x30/0x80 [ 5.267752] ? clear_bhb_loop+0x30/0x80 [ 5.267765] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 5.267772] RIP: 0033:0x7aaa8b32a9fb [ 5.267781] Code: c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 f3 0f 1e fa 31 f6 e9 05 00 00 00 0f 1f 44 00 00 f3 0f 1e fa b8 a6 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 e9 83 0d 00 f7 d8 [ 5.267787] RSP: 002b:00007ffd7c4c9468 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [ 5.267796] RAX: 0000000000000000 RBX: 00005a61592a8b00 RCX: 00007aaa8b32a9fb [ 5.267802] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00005a61592b2080 [ 5.267806] RBP: 00007ffd7c4c9540 R08: 00007aaa8b403b20 R09: 0000000000000020 [ 5.267812] R10: 0000000000000001 R11: 0000000000000246 R12: 00005a61592a8c00 [ 5.267817] R13: 0000000000000000 R14: 00005a61592b2080 R15: 00005a61592a8f10 [ 5.267849] [ 5.267854] irq event stamp: 4721 [ 5.267859] hardirqs last enabled at (4727): [] __up_console_sem+0x90/0xa0 [ 5.267873] hardirqs last disabled at (4732): [] __up_console_sem+0x75/0xa0 [ 5.267884] softirqs last enabled at (3044): [] kernel_fpu_end+0x53/0x70 [ 5.267895] softirqs last disabled at (3042): [] kernel_fpu_begin_mask+0xc4/0x120 [ 5.267905] ---[ end trace 0000000000000000 ]--- Fixes: bde708f1a65d ("fs/dax: always remove DAX page-cache entries when breaking layouts") Signed-off-by: Yuezhang Mo Reviewed-by: Friendy Su Reviewed-by: Daniel Palmer Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 1b529ace4db0..db13b40a78e0 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -1018,10 +1018,22 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) return 0; } +static void erofs_evict_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + dax_break_layout_final(inode); +#endif + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); +} + const struct super_operations erofs_sops = { .put_super = erofs_put_super, .alloc_inode = erofs_alloc_inode, .free_inode = erofs_free_inode, + .evict_inode = erofs_evict_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, }; From 8cc71fc3b82b51e155fbe20876b1aa17a315ac4c Mon Sep 17 00:00:00 2001 From: Nithyanantham Paramasivam Date: Fri, 5 Sep 2025 18:18:00 +0530 Subject: [PATCH 231/450] wifi: cfg80211: Fix "no buffer space available" error in nl80211_get_station() for MLO Currently, nl80211_get_station() allocates a fixed buffer size using NLMSG_DEFAULT_SIZE. In multi-link scenarios - particularly when the number of links exceeds two - this buffer size is often insufficient to accommodate complete station statistics, resulting in "no buffer space available" errors. To address this, modify nl80211_get_station() to return only accumulated station statistics and exclude per link stats. Pass a new flag (link_stats) to nl80211_send_station() to control the inclusion of per link statistics. This allows retaining detailed output with per link data in dump commands, while excluding it from other commands where it is not needed. This change modifies the handling of per link stats introduced in commit 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") to enable them only for nl80211_dump_station(). Apply the same fix to cfg80211_del_sta_sinfo() by skipping per link stats to avoid buffer issues. cfg80211_new_sta() doesn't include stats and is therefore not impacted. Fixes: 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") Signed-off-by: Nithyanantham Paramasivam Link: https://patch.msgid.link/20250905124800.1448493-1-nithyanantham.paramasivam@oss.qualcomm.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893..f2f7424e930c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7062,7 +7062,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg80211_registered_device *rdev, struct net_device *dev, - const u8 *mac_addr, struct station_info *sinfo) + const u8 *mac_addr, struct station_info *sinfo, + bool link_stats) { void *hdr; struct nlattr *sinfoattr, *bss_param; @@ -7283,7 +7284,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, goto nla_put_failure; } - if (sinfo->valid_links) { + if (link_stats && sinfo->valid_links) { links = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); if (!links) goto nla_put_failure; @@ -7574,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev->netdev, mac_addr, - &sinfo) < 0) + &sinfo, true) < 0) goto out; sta_idx++; @@ -7635,7 +7636,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, info->snd_portid, info->snd_seq, 0, - rdev, dev, mac_addr, &sinfo) < 0) { + rdev, dev, mac_addr, &sinfo, false) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -19680,7 +19681,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, return; if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } @@ -19710,7 +19711,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, } if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } From e0423541477dfb684fbc6e6b5386054bc650f264 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Sep 2025 15:44:45 +0200 Subject: [PATCH 232/450] PM: EM: Add function for registering a PD without capacity update The intel_pstate driver manages CPU capacity changes itself and it does not need an update of the capacity of all CPUs in the system to be carried out after registering a PD. Moreover, in some configurations (for instance, an SMT-capable hybrid x86 system booted with nosmt in the kernel command line) the em_check_capacity_update() call at the end of em_dev_register_perf_domain() always fails and reschedules itself to run once again in 1 s, so effectively it runs in vain every 1 s forever. To address this, introduce a new variant of em_dev_register_perf_domain(), called em_dev_register_pd_no_update(), that does not invoke em_check_capacity_update(), and make intel_pstate use it instead of the original. Fixes: 7b010f9b9061 ("cpufreq: intel_pstate: EAS support for hybrid platforms") Closes: https://lore.kernel.org/linux-pm/40212796-734c-4140-8a85-854f72b8144d@panix.com/ Reported-by: Kenneth R. Crudup Tested-by: Kenneth R. Crudup Cc: 6.16+ # 6.16+ Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 4 ++-- include/linux/energy_model.h | 10 ++++++++++ kernel/power/energy_model.c | 29 +++++++++++++++++++++++++---- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f366d35c5840..0d5d283a5429 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1034,8 +1034,8 @@ static bool hybrid_register_perf_domain(unsigned int cpu) if (!cpu_dev) return false; - if (em_dev_register_perf_domain(cpu_dev, HYBRID_EM_STATE_COUNT, &cb, - cpumask_of(cpu), false)) + if (em_dev_register_pd_no_update(cpu_dev, HYBRID_EM_STATE_COUNT, &cb, + cpumask_of(cpu), false)) return false; cpudata->pd_registered = true; diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 7fa1eb3cc823..61d50571ad88 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -171,6 +171,9 @@ int em_dev_update_perf_domain(struct device *dev, int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, const struct em_data_callback *cb, const cpumask_t *cpus, bool microwatts); +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts); void em_dev_unregister_perf_domain(struct device *dev); struct em_perf_table *em_table_alloc(struct em_perf_domain *pd); void em_table_free(struct em_perf_table *table); @@ -350,6 +353,13 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, { return -EINVAL; } +static inline +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) +{ + return -EINVAL; +} static inline void em_dev_unregister_perf_domain(struct device *dev) { } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index ea7995a25780..8df55397414a 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -552,6 +552,30 @@ EXPORT_SYMBOL_GPL(em_cpu_get); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, const struct em_data_callback *cb, const cpumask_t *cpus, bool microwatts) +{ + int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts); + + if (_is_cpu_device(dev)) + em_check_capacity_update(); + + return ret; +} +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_register_pd_no_update() - Register a perf domain for a device + * @dev : Device to register the PD for + * @nr_states : Number of performance states in the new PD + * @cb : Callback functions for populating the energy model + * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device) + * @microwatts : Whether or not the power values in the EM will be in uW + * + * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity + * update after registering the PD, even if @dev is a CPU device. + */ +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; @@ -636,12 +660,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); - if (_is_cpu_device(dev)) - em_check_capacity_update(); - return ret; } -EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); +EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); /** * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device From 8822e8be86d40410ddd2ac8ff44f3050c9ecf9c6 Mon Sep 17 00:00:00 2001 From: aprilgrimoire Date: Sun, 7 Sep 2025 09:06:11 +0000 Subject: [PATCH 233/450] platform/x86/amd/pmc: Add MECHREVO Yilong15Pro to spurious_8042 list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The firmware of Mechrevo Yilong15Pro emits a spurious keyboard interrupt on events including closing the lid. When a user closes the lid on an already suspended system this causes the system to wake up. Add Mechrevo Yilong15Pro Series (GM5HG7A) to the list of quirk spurious_8042 to work around this issue. Link: https://lore.kernel.org/linux-pm/6ww4uu6Gl4F5n6VY5dl1ufASfKzs4DhMxAN8BuqUpCoqU3PQukVSVSBCl_lKIzkQ-S8kt1acPd58eyolhkWN32lMLFj4ViI0Tdu2jwhnYZ8=@proton.me/ Signed-off-by: April Grimoire Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/IvSc_IN5Pa0wRXElTk_fEl-cTpMZxg6TCQk_7aRUkTd9vJUp_ZeC0NdXZ0z6Tn7B-XiqqqQvCH65lq6FqhuECBMEYWcHQmWm1Jo7Br8kpeg=@proton.me Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc-quirks.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c index 18fb44139de2..4d0a38e06f08 100644 --- a/drivers/platform/x86/amd/pmc/pmc-quirks.c +++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c @@ -239,6 +239,14 @@ static const struct dmi_system_id fwbug_list[] = { DMI_MATCH(DMI_BOARD_NAME, "WUJIE14-GX4HRXL"), } }, + { + .ident = "MECHREVO Yilong15Pro Series GM5HG7A", + .driver_data = &quirk_spurious_8042, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "MECHREVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "Yilong15Pro Series GM5HG7A"), + } + }, /* https://bugzilla.kernel.org/show_bug.cgi?id=220116 */ { .ident = "PCSpecialist Lafite Pro V 14M", From fba9d5448bd45b0ff7199c47023e9308ea4f1730 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Thu, 4 Sep 2025 15:22:51 +0200 Subject: [PATCH 234/450] platform/x86: oxpec: Add support for OneXPlayer X1Pro EVA-02 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is a special edition of X1Pro with a different color. Signed-off-by: Antheas Kapenekakis Link: https://patch.msgid.link/20250904132252.3041613-1-lkml@antheas.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/oxpec.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/platform/x86/oxpec.c b/drivers/platform/x86/oxpec.c index eb076bb4099b..4f540a9932fe 100644 --- a/drivers/platform/x86/oxpec.c +++ b/drivers/platform/x86/oxpec.c @@ -306,6 +306,13 @@ static const struct dmi_system_id dmi_table[] = { }, .driver_data = (void *)oxp_x1, }, + { + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ONE-NETBOOK"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "ONEXPLAYER X1Pro EVA-02"), + }, + .driver_data = (void *)oxp_x1, + }, {}, }; From d857d09fb653f081f5730e5549fce397513b0ef9 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Thu, 4 Sep 2025 15:22:52 +0200 Subject: [PATCH 235/450] platform/x86: oxpec: Add support for AOKZOE A1X MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Very similar to OneXFly devices. Uses the same registers. Signed-off-by: Antheas Kapenekakis Link: https://patch.msgid.link/20250904132252.3041613-2-lkml@antheas.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/oxpec.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/platform/x86/oxpec.c b/drivers/platform/x86/oxpec.c index 4f540a9932fe..54377b282ff8 100644 --- a/drivers/platform/x86/oxpec.c +++ b/drivers/platform/x86/oxpec.c @@ -124,6 +124,13 @@ static const struct dmi_system_id dmi_table[] = { }, .driver_data = (void *)aok_zoe_a1, }, + { + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "AOKZOE"), + DMI_EXACT_MATCH(DMI_BOARD_NAME, "AOKZOE A1X"), + }, + .driver_data = (void *)oxp_fly, + }, { .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "AYANEO"), From c45601306aa5831c3e59158f95b8e34f27e9ea09 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sat, 26 Jul 2025 14:29:30 +0200 Subject: [PATCH 236/450] um: Don't mark stack executable On one of my machines UML failed to start after enabling SELinux. UML failed to start because SELinux's execmod rule denies executable pages on a modified file mapping. Historically UML marks it's stack rwx. AFAICT, these days this is no longer needed, so let's remove PROT_EXEC. Signed-off-by: Richard Weinberger Signed-off-by: Johannes Berg --- arch/um/os-Linux/util.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 4193e04d7e4a..e3ad71a0d13c 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -20,8 +20,7 @@ void stack_protections(unsigned long address) { - if (mprotect((void *) address, UM_THREAD_SIZE, - PROT_READ | PROT_WRITE | PROT_EXEC) < 0) + if (mprotect((void *) address, UM_THREAD_SIZE, PROT_READ | PROT_WRITE) < 0) panic("protecting stack failed, errno = %d", errno); } From 7ebf70cf181651fe3f2e44e95e7e5073d594c9c0 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 28 Aug 2025 15:00:51 +0800 Subject: [PATCH 237/450] um: virtio_uml: Fix use-after-free after put_device in probe When register_virtio_device() fails in virtio_uml_probe(), the code sets vu_dev->registered = 1 even though the device was not successfully registered. This can lead to use-after-free or other issues. Fixes: 04e5b1fb0183 ("um: virtio: Remove device on disconnect") Signed-off-by: Miaoqian Lin Signed-off-by: Johannes Berg --- arch/um/drivers/virtio_uml.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index ad8d78fb1d9a..de7867ae220d 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -1250,10 +1250,12 @@ static int virtio_uml_probe(struct platform_device *pdev) device_set_wakeup_capable(&vu_dev->vdev.dev, true); rc = register_virtio_device(&vu_dev->vdev); - if (rc) + if (rc) { put_device(&vu_dev->vdev.dev); + return rc; + } vu_dev->registered = 1; - return rc; + return 0; error_init: os_close_file(vu_dev->sock); From df447a3b4a4b961c9979b4b3ffb74317394b9b40 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 1 Sep 2025 08:27:15 +0800 Subject: [PATCH 238/450] um: Fix FD copy size in os_rcv_fd_msg() When copying FDs, the copy size should not include the control message header (cmsghdr). Fix it. Fixes: 5cde6096a4dd ("um: generalize os_rcv_fd") Signed-off-by: Tiwei Bie Signed-off-by: Johannes Berg --- arch/um/os-Linux/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index 617886d1fb1e..21f0e50fb1df 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -535,7 +535,7 @@ ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, cmsg->cmsg_type != SCM_RIGHTS) return n; - memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len); + memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len - CMSG_LEN(0)); return n; } From 4de37a48b6b58faaded9eb765047cf0d8785ea18 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 29 Aug 2025 11:03:44 +0200 Subject: [PATCH 239/450] drm/mediatek: fix potential OF node use-after-free The for_each_child_of_node() helper drops the reference it takes to each node as it iterates over children and an explicit of_node_put() is only needed when exiting the loop early. Drop the recently introduced bogus additional reference count decrement at each iteration that could potentially lead to a use-after-free. Fixes: 1f403699c40f ("drm/mediatek: Fix device/node reference count leaks in mtk_drm_get_all_drm_priv") Cc: Ma Ke Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Reviewed-by: CK Hu Reviewed-by: AngeloGioacchino Del Regno Link: https://patchwork.kernel.org/project/dri-devel/patch/20250829090345.21075-2-johan@kernel.org/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index f8a817689e16..76719eb5db09 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -387,11 +387,11 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) of_id = of_match_node(mtk_drm_of_ids, node); if (!of_id) - goto next_put_node; + continue; pdev = of_find_device_by_node(node); if (!pdev) - goto next_put_node; + continue; drm_dev = device_find_child(&pdev->dev, NULL, mtk_drm_match); if (!drm_dev) @@ -417,11 +417,10 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) next_put_device_pdev_dev: put_device(&pdev->dev); -next_put_node: - of_node_put(node); - - if (cnt == MAX_CRTC) + if (cnt == MAX_CRTC) { + of_node_put(node); break; + } } if (drm_priv->data->mmsys_dev_num == cnt) { From 9ba2556cef1df746fad4d691c8290e235b23c7d1 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 29 Aug 2025 11:03:45 +0200 Subject: [PATCH 240/450] drm/mediatek: clean up driver data initialisation The platform and drm devices are only used to look up the drm device and its driver data respectively when initialising the driver data during bind(). Drop the reference counts as soon as they have been used to make the code more readable. Note that the crtc count is never incremented on lookup failures. Signed-off-by: Johan Hovold Reviewed-by: CK Hu Reviewed-by: AngeloGioacchino Del Regno Link: https://patchwork.kernel.org/project/dri-devel/patch/20250829090345.21075-3-johan@kernel.org/ Signed-off-by: Chun-Kuang Hu --- drivers/gpu/drm/mediatek/mtk_drm_drv.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_drm_drv.c b/drivers/gpu/drm/mediatek/mtk_drm_drv.c index 76719eb5db09..eb5537f0ac90 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_drv.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_drv.c @@ -394,12 +394,14 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) continue; drm_dev = device_find_child(&pdev->dev, NULL, mtk_drm_match); + put_device(&pdev->dev); if (!drm_dev) - goto next_put_device_pdev_dev; + continue; temp_drm_priv = dev_get_drvdata(drm_dev); + put_device(drm_dev); if (!temp_drm_priv) - goto next_put_device_drm_dev; + continue; if (temp_drm_priv->data->main_len) all_drm_priv[CRTC_MAIN] = temp_drm_priv; @@ -411,12 +413,6 @@ static bool mtk_drm_get_all_drm_priv(struct device *dev) if (temp_drm_priv->mtk_drm_bound) cnt++; -next_put_device_drm_dev: - put_device(drm_dev); - -next_put_device_pdev_dev: - put_device(&pdev->dev); - if (cnt == MAX_CRTC) { of_node_put(node); break; From 9c600589e14f5fc01b8be9a5d0ad1f094b8b304b Mon Sep 17 00:00:00 2001 From: James Guan Date: Wed, 10 Sep 2025 19:19:29 +0800 Subject: [PATCH 241/450] wifi: virt_wifi: Fix page fault on connect This patch prevents page fault in __cfg80211_connect_result()[1] when connecting a virt_wifi device, while ensuring that virt_wifi can connect properly. [1] https://lore.kernel.org/linux-wireless/20250909063213.1055024-1-guan_yufei@163.com/ Closes: https://lore.kernel.org/linux-wireless/20250909063213.1055024-1-guan_yufei@163.com/ Signed-off-by: James Guan Link: https://patch.msgid.link/20250910111929.137049-1-guan_yufei@163.com [remove irrelevant network-manager instructions] Signed-off-by: Johannes Berg --- drivers/net/wireless/virtual/virt_wifi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/virtual/virt_wifi.c b/drivers/net/wireless/virtual/virt_wifi.c index 1fffeff2190c..4eae89376feb 100644 --- a/drivers/net/wireless/virtual/virt_wifi.c +++ b/drivers/net/wireless/virtual/virt_wifi.c @@ -277,7 +277,9 @@ static void virt_wifi_connect_complete(struct work_struct *work) priv->is_connected = true; /* Schedules an event that acquires the rtnl lock. */ - cfg80211_connect_result(priv->upperdev, requested_bss, NULL, 0, NULL, 0, + cfg80211_connect_result(priv->upperdev, + priv->is_connected ? fake_router_bssid : NULL, + NULL, 0, NULL, 0, status, GFP_KERNEL); netif_carrier_on(priv->upperdev); } From 87b90cee22d8658a69c0fbd43633839b75f8f05f Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 2 Sep 2025 21:02:22 +0200 Subject: [PATCH 242/450] MAINTAINERS: drm-misc: fix X: entries for nova/nouveau Nouveau patches usually flow through the drm-misc tree, while nova (and nova-core) are maintained through a dedicated driver tree and soon through drm-rust. Hence, fix up the corresponding X: entries to list nova instead of nouveau. Reported-by: Maxime Ripard Closes: https://lore.kernel.org/dri-devel/enuksb2qk5wyrilz3l2vnog45lghgmplrav5to6pd5k5owi36h@pxdq6y5dpgpt/ Acked-by: Maxime Ripard Link: https://lore.kernel.org/r/20250902190247.435340-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f94e115a8d32..ed7e06ca1d05 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8091,7 +8091,7 @@ X: drivers/gpu/drm/i915/ X: drivers/gpu/drm/kmb/ X: drivers/gpu/drm/mediatek/ X: drivers/gpu/drm/msm/ -X: drivers/gpu/drm/nouveau/ +X: drivers/gpu/drm/nova/ X: drivers/gpu/drm/radeon/ X: drivers/gpu/drm/tegra/ X: drivers/gpu/drm/xe/ From a3967baad4d533dc254c31e0d221e51c8d223d58 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 9 Sep 2025 23:26:12 +0000 Subject: [PATCH 243/450] tcp_bpf: Call sk_msg_free() when tcp_bpf_send_verdict() fails to allocate psock->cork. syzbot reported the splat below. [0] The repro does the following: 1. Load a sk_msg prog that calls bpf_msg_cork_bytes(msg, cork_bytes) 2. Attach the prog to a SOCKMAP 3. Add a socket to the SOCKMAP 4. Activate fault injection 5. Send data less than cork_bytes At 5., the data is carried over to the next sendmsg() as it is smaller than the cork_bytes specified by bpf_msg_cork_bytes(). Then, tcp_bpf_send_verdict() tries to allocate psock->cork to hold the data, but this fails silently due to fault injection + __GFP_NOWARN. If the allocation fails, we need to revert the sk->sk_forward_alloc change done by sk_msg_alloc(). Let's call sk_msg_free() when tcp_bpf_send_verdict fails to allocate psock->cork. The "*copied" also needs to be updated such that a proper error can be returned to the caller, sendmsg. It fails to allocate psock->cork. Nothing has been corked so far, so this patch simply sets "*copied" to 0. [0]: WARNING: net/ipv4/af_inet.c:156 at inet_sock_destruct+0x623/0x730 net/ipv4/af_inet.c:156, CPU#1: syz-executor/5983 Modules linked in: CPU: 1 UID: 0 PID: 5983 Comm: syz-executor Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:inet_sock_destruct+0x623/0x730 net/ipv4/af_inet.c:156 Code: 0f 0b 90 e9 62 fe ff ff e8 7a db b5 f7 90 0f 0b 90 e9 95 fe ff ff e8 6c db b5 f7 90 0f 0b 90 e9 bb fe ff ff e8 5e db b5 f7 90 <0f> 0b 90 e9 e1 fe ff ff 89 f9 80 e1 07 80 c1 03 38 c1 0f 8c 9f fc RSP: 0018:ffffc90000a08b48 EFLAGS: 00010246 RAX: ffffffff8a09d0b2 RBX: dffffc0000000000 RCX: ffff888024a23c80 RDX: 0000000000000100 RSI: 0000000000000fff RDI: 0000000000000000 RBP: 0000000000000fff R08: ffff88807e07c627 R09: 1ffff1100fc0f8c4 R10: dffffc0000000000 R11: ffffed100fc0f8c5 R12: ffff88807e07c380 R13: dffffc0000000000 R14: ffff88807e07c60c R15: 1ffff1100fc0f872 FS: 00005555604c4500(0000) GS:ffff888125af1000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005555604df5c8 CR3: 0000000032b06000 CR4: 00000000003526f0 Call Trace: __sk_destruct+0x86/0x660 net/core/sock.c:2339 rcu_do_batch kernel/rcu/tree.c:2605 [inline] rcu_core+0xca8/0x1770 kernel/rcu/tree.c:2861 handle_softirqs+0x286/0x870 kernel/softirq.c:579 __do_softirq kernel/softirq.c:613 [inline] invoke_softirq kernel/softirq.c:453 [inline] __irq_exit_rcu+0xca/0x1f0 kernel/softirq.c:680 irq_exit_rcu+0x9/0x30 kernel/softirq.c:696 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1052 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1052 Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Reported-by: syzbot+4cabd1d2fa917a456db8@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68c0b6b5.050a0220.3c6139.0013.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250909232623.4151337-1-kuniyu@google.com --- net/ipv4/tcp_bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ba581785adb4..a268e1595b22 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -408,8 +408,11 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, if (!psock->cork) { psock->cork = kzalloc(sizeof(*psock->cork), GFP_ATOMIC | __GFP_NOWARN); - if (!psock->cork) + if (!psock->cork) { + sk_msg_free(sk, msg); + *copied = 0; return -ENOMEM; + } } memcpy(psock->cork, msg, sizeof(*msg)); return 0; From d013ebc3499fd87cb9dee1dafd0c58aeb05c27c1 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 10 Sep 2025 16:56:06 +0200 Subject: [PATCH 244/450] selftests: can: enable CONFIG_CAN_VCAN as a module A proper kernel configuration for running kselftest can be obtained with: $ yes | make kselftest-merge Build of 'vcan' driver is currently missing, while the other required knobs are already there because of net/link_netns.py [1]. Add a config file in selftests/net/can to store the minimum set of kconfig needed for CAN selftests. [1] https://patch.msgid.link/20250219125039.18024-14-shaw.leon@gmail.com Fixes: 77442ffa83e8 ("selftests: can: Import tst-filter from can-tests") Reviewed-by: Vincent Mailhol Signed-off-by: Davide Caratti Link: https://patch.msgid.link/fa4c0ea262ec529f25e5f5aa9269d84764c67321.1757516009.git.dcaratti@redhat.com Signed-off-by: Marc Kleine-Budde --- tools/testing/selftests/net/can/config | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tools/testing/selftests/net/can/config diff --git a/tools/testing/selftests/net/can/config b/tools/testing/selftests/net/can/config new file mode 100644 index 000000000000..188f79796670 --- /dev/null +++ b/tools/testing/selftests/net/can/config @@ -0,0 +1,3 @@ +CONFIG_CAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_VCAN=m From 7fcbe5b2c6a4b5407bf2241fdb71e0a390f6ab9a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 25 Aug 2025 23:07:24 +0900 Subject: [PATCH 245/450] can: j1939: implement NETDEV_UNREGISTER notification handler syzbot is reporting unregister_netdevice: waiting for vcan0 to become free. Usage count = 2 problem, for j1939 protocol did not have NETDEV_UNREGISTER notification handler for undoing changes made by j1939_sk_bind(). Commit 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") expects that a call to j1939_priv_put() can be unconditionally delayed until j1939_sk_sock_destruct() is called. But we need to call j1939_priv_put() against an extra ref held by j1939_sk_bind() call (as a part of undoing changes made by j1939_sk_bind()) as soon as NETDEV_UNREGISTER notification fires (i.e. before j1939_sk_sock_destruct() is called via j1939_sk_release()). Otherwise, the extra ref on "struct j1939_priv" held by j1939_sk_bind() call prevents "struct net_device" from dropping the usage count to 1; making it impossible for unregister_netdevice() to continue. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=881d65229ca4f9ae8c84 Tested-by: syzbot Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Fixes: 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/ac9db9a4-6c30-416e-8b94-96e6559d55b2@I-love.SAKURA.ne.jp [mkl: remove space in front of label] Signed-off-by: Marc Kleine-Budde --- net/can/j1939/j1939-priv.h | 1 + net/can/j1939/main.c | 3 +++ net/can/j1939/socket.c | 49 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 31a93cae5111..81f58924b4ac 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -212,6 +212,7 @@ void j1939_priv_get(struct j1939_priv *priv); /* notify/alert all j1939 sockets bound to ifindex */ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv); +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv); int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk); void j1939_tp_init(struct j1939_priv *priv); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 7e8a20f2fc42..3706a872ecaf 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -377,6 +377,9 @@ static int j1939_netdev_notify(struct notifier_block *nb, j1939_sk_netdev_event_netdown(priv); j1939_ecu_unmap_all(priv); break; + case NETDEV_UNREGISTER: + j1939_sk_netdev_event_unregister(priv); + break; } j1939_priv_put(priv); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 3d8b588822f9..70ebc861ea2a 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -1300,6 +1300,55 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) read_unlock_bh(&priv->j1939_socks_lock); } +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) +{ + struct sock *sk; + struct j1939_sock *jsk; + bool wait_rcu = false; + +rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ + read_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + /* Skip if j1939_jsk_add() is not called on this socket. */ + if (!(jsk->state & J1939_SOCK_BOUND)) + continue; + sk = &jsk->sk; + sock_hold(sk); + read_unlock_bh(&priv->j1939_socks_lock); + /* Check if j1939_jsk_del() is not yet called on this socket after holding + * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call + * j1939_jsk_del() with socket's lock held. + */ + lock_sock(sk); + if (jsk->state & J1939_SOCK_BOUND) { + /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del(). + * Make this socket no longer bound, by pretending as if j1939_sk_bind() + * dropped old references but did not get new references. + */ + j1939_jsk_del(priv, jsk); + j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); + j1939_netdev_stop(priv); + /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from + * calling the corresponding j1939_priv_put(). + * + * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after + * an RCU grace period. But since the caller is holding a ref on this + * "priv", we can defer synchronize_rcu() until immediately before + * the caller calls j1939_priv_put(). + */ + j1939_priv_put(priv); + jsk->priv = NULL; + wait_rcu = true; + } + release_sock(sk); + sock_put(sk); + goto rescan; + } + read_unlock_bh(&priv->j1939_socks_lock); + if (wait_rcu) + synchronize_rcu(); +} + static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { From f214744c8a27c3c1da6b538c232da22cd027530e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 24 Aug 2025 19:30:09 +0900 Subject: [PATCH 246/450] can: j1939: j1939_sk_bind(): call j1939_priv_put() immediately when j1939_local_ecu_get() failed Commit 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") expects that a call to j1939_priv_put() can be unconditionally delayed until j1939_sk_sock_destruct() is called. But a refcount leak will happen when j1939_sk_bind() is called again after j1939_local_ecu_get() from previous j1939_sk_bind() call returned an error. We need to call j1939_priv_put() before j1939_sk_bind() returns an error. Fixes: 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/4f49a1bc-a528-42ad-86c0-187268ab6535@I-love.SAKURA.ne.jp Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 70ebc861ea2a..88e7160d4248 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -521,6 +521,9 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); + jsk->priv = NULL; + synchronize_rcu(); + j1939_priv_put(priv); goto out_release_sock; } From 06e02da29f6f1a45fc07bd60c7eaf172dc21e334 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 24 Aug 2025 19:27:40 +0900 Subject: [PATCH 247/450] can: j1939: j1939_local_ecu_get(): undo increment when j1939_local_ecu_get() fails Since j1939_sk_bind() and j1939_sk_release() call j1939_local_ecu_put() when J1939_SOCK_BOUND was already set, but the error handling path for j1939_sk_bind() will not set J1939_SOCK_BOUND when j1939_local_ecu_get() fails, j1939_local_ecu_get() needs to undo priv->ents[sa].nusers++ when j1939_local_ecu_get() returns an error. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/e7f80046-4ff7-4ce2-8ad8-7c3c678a42c9@I-love.SAKURA.ne.jp Signed-off-by: Marc Kleine-Budde --- net/can/j1939/bus.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 39844f14eed8..797719cb227e 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -290,8 +290,11 @@ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa) if (!ecu) ecu = j1939_ecu_create_locked(priv, name); err = PTR_ERR_OR_ZERO(ecu); - if (err) + if (err) { + if (j1939_address_is_unicast(sa)) + priv->ents[sa].nusers--; goto done; + } ecu->nusers++; /* TODO: do we care if ecu->addr != sa? */ From ef79f00be72bd81d2e1e6f060d83cf7e425deee4 Mon Sep 17 00:00:00 2001 From: Anssi Hannula Date: Fri, 22 Aug 2025 12:50:02 +0300 Subject: [PATCH 248/450] can: xilinx_can: xcan_write_frame(): fix use-after-free of transmitted SKB can_put_echo_skb() takes ownership of the SKB and it may be freed during or after the call. However, xilinx_can xcan_write_frame() keeps using SKB after the call. Fix that by only calling can_put_echo_skb() after the code is done touching the SKB. The tx_lock is held for the entire xcan_write_frame() execution and also on the can_get_echo_skb() side so the order of operations does not matter. An earlier fix commit 3d3c817c3a40 ("can: xilinx_can: Fix usage of skb memory") did not move the can_put_echo_skb() call far enough. Signed-off-by: Anssi Hannula Fixes: 1598efe57b3e ("can: xilinx_can: refactor code in preparation for CAN FD support") Link: https://patch.msgid.link/20250822095002.168389-1-anssi.hannula@bitwise.fi [mkl: add "commit" in front of sha1 in patch description] [mkl: fix indention] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/xilinx_can.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 81baec8eb1e5..a25a3ca62c12 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -690,14 +690,6 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb, dlc |= XCAN_DLCR_EDL_MASK; } - if (!(priv->devtype.flags & XCAN_FLAG_TX_MAILBOXES) && - (priv->devtype.flags & XCAN_FLAG_TXFEMP)) - can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max, 0); - else - can_put_echo_skb(skb, ndev, 0, 0); - - priv->tx_head++; - priv->write_reg(priv, XCAN_FRAME_ID_OFFSET(frame_offset), id); /* If the CAN frame is RTR frame this write triggers transmission * (not on CAN FD) @@ -730,6 +722,14 @@ static void xcan_write_frame(struct net_device *ndev, struct sk_buff *skb, data[1]); } } + + if (!(priv->devtype.flags & XCAN_FLAG_TX_MAILBOXES) && + (priv->devtype.flags & XCAN_FLAG_TXFEMP)) + can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max, 0); + else + can_put_echo_skb(skb, ndev, 0, 0); + + priv->tx_head++; } /** From 5c793afa07da6d2d4595f6c73a2a543a471bb055 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 14 Aug 2025 13:26:37 +0200 Subject: [PATCH 249/450] can: rcar_can: rcar_can_resume(): fix s2ram with PSCI On R-Car Gen3 using PSCI, s2ram powers down the SoC. After resume, the CAN interface no longer works, until it is brought down and up again. Fix this by calling rcar_can_start() from the PM resume callback, to fully initialize the controller instead of just restarting it. Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/699b2f7fcb60b31b6f976a37f08ce99c5ffccb31.1755165227.git.geert+renesas@glider.be Signed-off-by: Marc Kleine-Budde --- drivers/net/can/rcar/rcar_can.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index 64e664f5adcc..87c134bcd48d 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -861,7 +861,6 @@ static int rcar_can_resume(struct device *dev) { struct net_device *ndev = dev_get_drvdata(dev); struct rcar_can_priv *priv = netdev_priv(ndev); - u16 ctlr; int err; if (!netif_running(ndev)) @@ -873,12 +872,7 @@ static int rcar_can_resume(struct device *dev) return err; } - ctlr = readw(&priv->regs->ctlr); - ctlr &= ~RCAR_CAN_CTLR_SLPM; - writew(ctlr, &priv->regs->ctlr); - ctlr &= ~RCAR_CAN_CTLR_CANM; - writew(ctlr, &priv->regs->ctlr); - priv->can.state = CAN_STATE_ERROR_ACTIVE; + rcar_can_start(ndev); netif_device_attach(ndev); netif_start_queue(ndev); From 25fbbaf515acd13399589bd5ee6de5f35740cef2 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Sun, 31 Aug 2025 01:08:56 +0800 Subject: [PATCH 250/450] clk: sunxi-ng: mp: Fix dual-divider clock rate readback When dual-divider clock support was introduced, the P divider offset was left out of the .recalc_rate readback function. This causes the clock rate to become bogus or even zero (possibly due to the P divider being 1, leading to a divide-by-zero). Fix this by incorporating the P divider offset into the calculation. Fixes: 45717804b75e ("clk: sunxi-ng: mp: introduce dual-divider clock") Reviewed-by: Andre Przywara Link: https://patch.msgid.link/20250830170901.1996227-4-wens@kernel.org Signed-off-by: Chen-Yu Tsai --- drivers/clk/sunxi-ng/ccu_mp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/sunxi-ng/ccu_mp.c b/drivers/clk/sunxi-ng/ccu_mp.c index 354c981943b6..4221b1888b38 100644 --- a/drivers/clk/sunxi-ng/ccu_mp.c +++ b/drivers/clk/sunxi-ng/ccu_mp.c @@ -185,7 +185,7 @@ static unsigned long ccu_mp_recalc_rate(struct clk_hw *hw, p &= (1 << cmp->p.width) - 1; if (cmp->common.features & CCU_FEATURE_DUAL_DIV) - rate = (parent_rate / p) / m; + rate = (parent_rate / (p + cmp->p.offset)) / m; else rate = (parent_rate >> p) / m; From 5e13f2c491a4100d208e77e92fe577fe3dbad6c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Sep 2025 14:45:21 +0200 Subject: [PATCH 251/450] netfilter: nft_set_bitmap: fix lockdep splat due to missing annotation Running new 'set_flush_add_atomic_bitmap' test case for nftables.git with CONFIG_PROVE_RCU_LIST=y yields: net/netfilter/nft_set_bitmap.c:231 RCU-list traversed in non-reader section!! rcu_scheduler_active = 2, debug_locks = 1 1 lock held by nft/4008: #0: ffff888147f79cd8 (&nft_net->commit_mutex){+.+.}-{4:4}, at: nf_tables_valid_genid+0x2f/0xd0 lockdep_rcu_suspicious+0x116/0x160 nft_bitmap_walk+0x22d/0x240 nf_tables_delsetelem+0x1010/0x1a00 .. This is a false positive, the list cannot be altered while the transaction mutex is held, so pass the relevant argument to the iterator. Fixes tag intentionally wrong; no point in picking this up if earlier false-positive-fixups were not applied. Fixes: 28b7a6b84c0a ("netfilter: nf_tables: avoid false-positive lockdep splats in set walker") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_bitmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index c24c922f895d..8d3f040a904a 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -226,7 +226,8 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; From c4eaca2e1052adfd67bed0a36a9d4b8e515666e4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:18 +0200 Subject: [PATCH 252/450] netfilter: nft_set_pipapo: don't check genbit from packetpath lookups The pipapo set type is special in that it has two copies of its datastructure: one live copy containing only valid elements and one on-demand clone used during transaction where adds/deletes happen. This clone is not visible to the datapath. This is unlike all other set types in nftables, those all link new elements into their live hlist/tree. For those sets, the lookup functions must skip the new elements while the transaction is ongoing to ensure consistency. As the clone is shallow, removal does have an effect on the packet path: once the transaction enters the commit phase the 'gencursor' bit that determines which elements are active and which elements should be ignored (because they are no longer valid) is flipped. This causes the datapath lookup to ignore these elements if they are found during lookup. This opens up a small race window where pipapo has an inconsistent view of the dataset from when the transaction-cpu flipped the genbit until the transaction-cpu calls nft_pipapo_commit() to swap live/clone pointers: cpu0 cpu1 has added new elements to clone has marked elements as being inactive in new generation perform lookup in the set enters commit phase: I) increments the genbit A) observes new genbit removes elements from the clone so they won't be found anymore B) lookup in datastructure can't see new elements yet, but old elements are ignored -> Only matches elements that were not changed in the transaction II) calls nft_pipapo_commit(), clone and live pointers are swapped. C New nft_lookup happening now will find matching elements. Consider a packet matching range r1-r2: cpu0 processes following transaction: 1. remove r1-r2 2. add r1-r3 P is contained in both ranges. Therefore, cpu1 should always find a match for P. Due to above race, this is not the case: cpu1 does find r1-r2, but then ignores it due to the genbit indicating the range has been removed. At the same time, r1-r3 is not visible yet, because it can only be found in the clone. The situation persists for all lookups until after cpu0 hits II). The fix is easy: Don't check the genbit from pipapo lookup functions. This is possible because unlike the other set types, the new elements are not reachable from the live copy of the dataset. The clone/live pointer swap is enough to avoid matching on old elements while at the same time all new elements are exposed in one go. After this change, step B above returns a match in r1-r2. This is fine: r1-r2 only becomes truly invalid the moment they get freed. This happens after a synchronize_rcu() call and rcu read lock is held via netfilter hook traversal (nf_hook_slow()). Cc: Stefano Brivio Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 20 ++++++++++++++++++-- net/netfilter/nft_set_pipapo_avx2.c | 4 +--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 9a10251228fd..793790d79d13 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -510,6 +510,23 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, * * This function is called from the data path. It will search for * an element matching the given key in the current active copy. + * Unlike other set types, this uses NFT_GENMASK_ANY instead of + * nft_genmask_cur(). + * + * This is because new (future) elements are not reachable from + * priv->match, they get added to priv->clone instead. + * When the commit phase flips the generation bitmask, the + * 'now old' entries are skipped but without the 'now current' + * elements becoming visible. Using nft_genmask_cur() thus creates + * inconsistent state: matching old entries get skipped but thew + * newly matching entries are unreachable. + * + * GENMASK will still find the 'now old' entries which ensures consistent + * priv->match view. + * + * nft_pipapo_commit swaps ->clone and ->match shortly after the + * genbit flip. As ->clone doesn't contain the old entries in the first + * place, lookup will only find the now-current ones. * * Return: ntables API extension pointer or NULL if no match. */ @@ -518,12 +535,11 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); - e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64()); + e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64()); return e ? &e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 2f090e253caf..c0884fa68c79 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1152,7 +1152,6 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, struct nft_pipapo *priv = nft_set_priv(set); const struct nft_set_ext *ext = NULL; struct nft_pipapo_scratch *scratch; - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; @@ -1248,8 +1247,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, if (last) { const struct nft_set_ext *e = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(e) || - !nft_set_elem_active(e, genmask))) + if (unlikely(nft_set_elem_expired(e))) goto next_match; ext = e; From a60f7bf4a1524d8896b76ba89623080aebf44272 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:19 +0200 Subject: [PATCH 253/450] netfilter: nft_set_rbtree: continue traversal if element is inactive When the rbtree lookup function finds a match in the rbtree, it sets the range start interval to a potentially inactive element. Then, after tree lookup, if the matching element is inactive, it returns NULL and suppresses a matching result. This is wrong and leads to false negative matches when a transaction has already entered the commit phase. cpu0 cpu1 has added new elements to clone has marked elements as being inactive in new generation perform lookup in the set enters commit phase: I) increments the genbit A) observes new genbit B) finds matching range C) returns no match: found range invalid in new generation II) removes old elements from the tree C New nft_lookup happening now will find matching element, because it is no longer obscured by old, inactive one. Consider a packet matching range r1-r2: cpu0 processes following transaction: 1. remove r1-r2 2. add r1-r3 P is contained in both ranges. Therefore, cpu1 should always find a match for P. Due to above race, this is not the case: cpu1 does find r1-r2, but then ignores it due to the genbit indicating the range has been removed. It does NOT test for further matches. The situation persists for all lookups until after cpu0 hits II) after which r1-r3 range start node is tested for the first time. Move the "interval start is valid" check ahead so that tree traversal continues if the starting interval is not valid in this generation. Thanks to Stefan Hanreich for providing an initial reproducer for this bug. Reported-by: Stefan Hanreich Fixes: c1eda3c6394f ("netfilter: nft_rbtree: ignore inactive matching element with no descendants") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_rbtree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 938a257c069e..b1f04168ec93 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -77,7 +77,9 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; - interval = rbe; + if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_rbtree_elem_expired(rbe)) + interval = rbe; } else if (d > 0) parent = rcu_dereference_raw(parent->rb_right); else { @@ -102,8 +104,6 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, } if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) return &interval->ext; From 64102d9bbc3d41dac5188b8fba75b1344c438970 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:20 +0200 Subject: [PATCH 254/450] netfilter: nf_tables: place base_seq in struct net This will soon be read from packet path around same time as the gencursor. Both gencursor and base_seq get incremented almost at the same time, so it makes sense to place them in the same structure. This doesn't increase struct net size on 64bit due to padding. Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 1 - include/net/netns/nftables.h | 1 + net/netfilter/nf_tables_api.c | 65 ++++++++++++++++--------------- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 891e43a01bdc..3faa80f5d801 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1912,7 +1912,6 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; u64 tstamp; - unsigned int base_seq; unsigned int gc_seq; u8 validate_state; struct work_struct destroy_work; diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index cc8060c017d5..99dd166c5d07 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -3,6 +3,7 @@ #define _NETNS_NFTABLES_H_ struct netns_nftables { + unsigned int base_seq; u8 gencursor; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c1082de09656..9518b50695ba 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1131,11 +1131,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } -static __be16 nft_base_seq(const struct net *net) +static unsigned int nft_base_seq(const struct net *net) { - struct nftables_pernet *nft_net = nft_pernet(net); + return READ_ONCE(net->nft.base_seq); +} - return htons(nft_net->base_seq & 0xffff); +static __be16 nft_base_seq_be16(const struct net *net) +{ + return htons(nft_base_seq(net) & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -1155,7 +1158,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -1248,7 +1251,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -2030,7 +2033,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -2133,7 +2136,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -3671,7 +3674,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, - nft_base_seq(net)); + nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -3839,7 +3842,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -4050,7 +4053,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), (char *)nla_data(nla[NFTA_RULE_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); kfree(buf); @@ -4887,7 +4890,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), flags, ctx->family, NFNETLINK_V0, - nft_base_seq(ctx->net)); + nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -5032,7 +5035,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && @@ -6209,7 +6212,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && @@ -6238,7 +6241,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) seq = cb->nlh->nlmsg_seq; nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, - table->family, NFNETLINK_V0, nft_base_seq(net)); + table->family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -6331,7 +6334,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, - NFNETLINK_V0, nft_base_seq(ctx->net)); + NFNETLINK_V0, nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -6630,7 +6633,7 @@ static int nf_tables_getsetelem_reset(struct sk_buff *skb, } nelems++; } - audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); out_unlock: rcu_read_unlock(); @@ -8381,7 +8384,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -8446,7 +8449,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -8480,7 +8483,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) idx++; } if (ctx->reset && entries) - audit_log_obj_reset(table, nft_net->base_seq, entries); + audit_log_obj_reset(table, nft_base_seq(net), entries); if (rc < 0) break; } @@ -8649,7 +8652,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); @@ -8754,9 +8757,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { - struct nftables_pernet *nft_net = nft_pernet(net); char *buf = kasprintf(gfp, "%s:%u", - table->name, nft_net->base_seq); + table->name, nft_base_seq(net)); audit_log_nfcfg(buf, family, @@ -9442,7 +9444,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -9511,7 +9513,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -9696,17 +9698,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { - struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, - NFNETLINK_V0, nft_base_seq(net)); + NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -10968,11 +10969,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. */ - base_seq = READ_ONCE(nft_net->base_seq); + base_seq = nft_base_seq(net); while (++base_seq == 0) ; - WRITE_ONCE(nft_net->base_seq, base_seq); + WRITE_ONCE(net->nft.base_seq, base_seq); gc_seq = nft_gc_seq_begin(nft_net); @@ -11181,7 +11182,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); - nf_tables_commit_audit_log(&adl, nft_net->base_seq); + nf_tables_commit_audit_log(&adl, nft_base_seq(net)); nft_gc_seq_end(nft_net, gc_seq); nft_net->validate_state = NFT_VALIDATE_SKIP; @@ -11506,7 +11507,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid) mutex_lock(&nft_net->commit_mutex); nft_net->tstamp = get_jiffies_64(); - genid_ok = genid == 0 || nft_net->base_seq == genid; + genid_ok = genid == 0 || nft_base_seq(net) == genid; if (!genid_ok) mutex_unlock(&nft_net->commit_mutex); @@ -12143,7 +12144,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); - nft_net->base_seq = 1; + net->nft.base_seq = 1; nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); From 11fe5a82e53ac3581a80c88e0e35fb8a80e15f48 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:21 +0200 Subject: [PATCH 255/450] netfilter: nf_tables: make nft_set_do_lookup available unconditionally This function was added for retpoline mitigation and is replaced by a static inline helper if mitigations are not enabled. Enable this helper function unconditionally so next patch can add a lookup restart mechanism to fix possible false negatives while transactions are in progress. Adding lookup restarts in nft_lookup_eval doesn't work as nft_objref would then need the same copypaste loop. This patch is separate to ease review of the actual bug fix. Suggested-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables_core.h | 10 ++-------- net/netfilter/nft_lookup.c | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 6c2f483d9828..656e784714f3 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -109,17 +109,11 @@ nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, const struct nft_set_ext * nft_hash_lookup(const struct net *net, const struct nft_set *set, const u32 *key); +#endif + const struct nft_set_ext * nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key); -#else -static inline const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) -{ - return set->ops->lookup(net, set, key); -} -#endif /* called from nft_pipapo_avx2.c */ const struct nft_set_ext * diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 40c602ffbcba..2c6909bf1b40 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -24,11 +24,11 @@ struct nft_lookup { struct nft_set_binding binding; }; -#ifdef CONFIG_MITIGATION_RETPOLINE -const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) +static const struct nft_set_ext * +__nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { +#ifdef CONFIG_MITIGATION_RETPOLINE if (set->ops == &nft_set_hash_fast_type.ops) return nft_hash_lookup_fast(net, set, key); if (set->ops == &nft_set_hash_type.ops) @@ -51,10 +51,17 @@ nft_set_do_lookup(const struct net *net, const struct nft_set *set, return nft_rbtree_lookup(net, set, key); WARN_ON_ONCE(1); +#endif return set->ops->lookup(net, set, key); } + +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + return __nft_set_do_lookup(net, set, key); +} EXPORT_SYMBOL_GPL(nft_set_do_lookup); -#endif void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, From b2f742c846cab9afc5953a5d8f17b54922dcc723 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:22 +0200 Subject: [PATCH 256/450] netfilter: nf_tables: restart set lookup on base_seq change The hash, hash_fast, rhash and bitwise sets may indicate no result even though a matching element exists during a short time window while other cpu is finalizing the transaction. This happens when the hash lookup/bitwise lookup function has picked up the old genbit, right before it was toggled by nf_tables_commit(), but then the same cpu managed to unlink the matching old element from the hash table: cpu0 cpu1 has added new elements to clone has marked elements as being inactive in new generation perform lookup in the set enters commit phase: A) observes old genbit increments base_seq I) increments the genbit II) removes old element from the set B) finds matching element C) returns no match: found element is not valid in old generation Next lookup observes new genbit and finds matching e2. Consider a packet matching element e1, e2. cpu0 processes following transaction: 1. remove e1 2. adds e2, which has same key as e1. P matches both e1 and e2. Therefore, cpu1 should always find a match for P. Due to above race, this is not the case: cpu1 observed the old genbit. e2 will not be considered once it is found. The element e1 is not found anymore if cpu0 managed to unlink it from the hlist before cpu1 found it during list traversal. The situation only occurs for a brief time period, lookups happening after I) observe new genbit and return e2. This problem exists in all set types except nft_set_pipapo, so fix it once in nft_lookup rather than each set ops individually. Sample the base sequence counter, which gets incremented right before the genbit is changed. Then, if no match is found, retry the lookup if the base sequence was altered in between. If the base sequence hasn't changed: - No update took place: no-match result is expected. This is the common case. or: - nf_tables_commit() hasn't progressed to genbit update yet. Old elements were still visible and nomatch result is expected, or: - nf_tables_commit updated the genbit: We picked up the new base_seq, so the lookup function also picked up the new genbit, no-match result is expected. If the old genbit was observed, then nft_lookup also picked up the old base_seq: nft_lookup_should_retry() returns true and relookup is performed in the new generation. This problem was added when the unconditional synchronize_rcu() call that followed the current/next generation bit toggle was removed. Thanks to Pablo Neira Ayuso for reviewing an earlier version of this patchset, for suggesting re-use of existing base_seq and placement of the restart loop in nft_set_do_lookup(). Fixes: 0cbc06b3faba ("netfilter: nf_tables: remove synchronize_rcu in commit phase") Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 3 ++- net/netfilter/nft_lookup.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9518b50695ba..c3c73411c40c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10973,7 +10973,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) while (++base_seq == 0) ; - WRITE_ONCE(net->nft.base_seq, base_seq); + /* pairs with smp_load_acquire in nft_lookup_eval */ + smp_store_release(&net->nft.base_seq, base_seq); gc_seq = nft_gc_seq_begin(nft_net); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 2c6909bf1b40..58c5b14889c4 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -55,11 +55,40 @@ __nft_set_do_lookup(const struct net *net, const struct nft_set *set, return set->ops->lookup(net, set, key); } +static unsigned int nft_base_seq(const struct net *net) +{ + /* pairs with smp_store_release() in nf_tables_commit() */ + return smp_load_acquire(&net->nft.base_seq); +} + +static bool nft_lookup_should_retry(const struct net *net, unsigned int seq) +{ + return unlikely(seq != nft_base_seq(net)); +} + const struct nft_set_ext * nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { - return __nft_set_do_lookup(net, set, key); + const struct nft_set_ext *ext; + unsigned int base_seq; + + do { + base_seq = nft_base_seq(net); + + ext = __nft_set_do_lookup(net, set, key); + if (ext) + break; + /* No match? There is a small chance that lookup was + * performed in the old generation, but nf_tables_commit() + * already unlinked a (matching) element. + * + * We need to repeat the lookup to make sure that we didn't + * miss a matching element in the new generation. + */ + } while (nft_lookup_should_retry(net, base_seq)); + + return ext; } EXPORT_SYMBOL_GPL(nft_set_do_lookup); From 37a9675e61a2a2a721a28043ffdf2c8ec81eba37 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Sep 2025 23:52:31 +0200 Subject: [PATCH 257/450] MAINTAINERS: add Phil as netfilter reviewer Phil has contributed to netfilter with features, fixes and patch reviews for a long time. Make this more formal and add Reviewer tag. Acked-by: Jozsef Kadlecsik Acked-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2df02e4374ed..ba11421c33e5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17480,6 +17480,7 @@ NETFILTER M: Pablo Neira Ayuso M: Jozsef Kadlecsik M: Florian Westphal +R: Phil Sutter L: netfilter-devel@vger.kernel.org L: coreteam@netfilter.org S: Maintained From 449c9c02537a146ac97ef962327a221e21c9cab3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 10 Sep 2025 11:41:59 +0200 Subject: [PATCH 258/450] PM: hibernate: Restrict GFP mask in hibernation_snapshot() Commit 12ffc3b1513e ("PM: Restrict swap use to later in the suspend sequence") incorrectly removed a pm_restrict_gfp_mask() call from hibernation_snapshot(), so memory allocations involving swap are not prevented from being carried out in this code path any more which may lead to serious breakage. The symptoms of such breakage have become visible after adding a shrink_shmem_memory() call to hibernation_snapshot() in commit 2640e819474f ("PM: hibernate: shrink shmem pages after dev_pm_ops.prepare()") which caused this problem to be much more likely to manifest itself. However, since commit 2640e819474f was initially present in the DRM tree that did not include commit 12ffc3b1513e, the symptoms of this issue were not visible until merge commit 260f6f4fda93 ("Merge tag 'drm-next-2025-07-30' of https://gitlab.freedesktop.org/drm/kernel") that exposed it through an entirely reasonable merge conflict resolution. Fixes: 12ffc3b1513e ("PM: Restrict swap use to later in the suspend sequence") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220555 Reported-by: Todd Brandt Tested-by: Todd Brandt Cc: 6.16+ # 6.16+ Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) --- kernel/power/hibernate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f1f30cca573..2f66ab453823 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -449,6 +449,7 @@ int hibernation_snapshot(int platform_mode) shrink_shmem_memory(); console_suspend_all(); + pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); From d02e48830e3fce9701265f6c5a58d9bdaf906a76 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 25 Aug 2025 18:44:28 +0200 Subject: [PATCH 259/450] KVM: SVM: Sync TPR from LAPIC into VMCB::V_TPR even if AVIC is active Commit 3bbf3565f48c ("svm: Do not intercept CR8 when enable AVIC") inhibited pre-VMRUN sync of TPR from LAPIC into VMCB::V_TPR in sync_lapic_to_cr8() when AVIC is active. AVIC does automatically sync between these two fields, however it does so only on explicit guest writes to one of these fields, not on a bare VMRUN. This meant that when AVIC is enabled host changes to TPR in the LAPIC state might not get automatically copied into the V_TPR field of VMCB. This is especially true when it is the userspace setting LAPIC state via KVM_SET_LAPIC ioctl() since userspace does not have access to the guest VMCB. Practice shows that it is the V_TPR that is actually used by the AVIC to decide whether to issue pending interrupts to the CPU (not TPR in TASKPRI), so any leftover value in V_TPR will cause serious interrupt delivery issues in the guest when AVIC is enabled. Fix this issue by doing pre-VMRUN TPR sync from LAPIC into VMCB::V_TPR even when AVIC is enabled. Fixes: 3bbf3565f48c ("svm: Do not intercept CR8 when enable AVIC") Cc: stable@vger.kernel.org Signed-off-by: Maciej S. Szmigiero Reviewed-by: Naveen N Rao (AMD) Link: https://lore.kernel.org/r/c231be64280b1461e854e1ce3595d70cde3a2e9d.1756139678.git.maciej.szmigiero@oracle.com [sean: tag for stable@] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9931c6c4bc6..1bfebe40854f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4046,8 +4046,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (nested_svm_virtualize_tpr(vcpu) || - kvm_vcpu_apicv_active(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; cr8 = kvm_get_cr8(vcpu); From cdbc9836c7afadad68f374791738f118263c5371 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 3 Jul 2025 12:10:50 +0200 Subject: [PATCH 260/450] libceph: fix invalid accesses to ceph_connection_v1_info There is a place where generic code in messenger.c is reading and another place where it is writing to con->v1 union member without checking that the union member is active (i.e. msgr1 is in use). On 64-bit systems, con->v1.auth_retry overlaps with con->v2.out_iter, so such a read is almost guaranteed to return a bogus value instead of 0 when msgr2 is in use. This ends up being fairly benign because the side effect is just the invalidation of the authorizer and successive fetching of new tickets. con->v1.connect_seq overlaps with con->v2.conn_bufs and the fact that it's being written to can cause more serious consequences, but luckily it's not something that happens often. Cc: stable@vger.kernel.org Fixes: cd1a677cad99 ("libceph, ceph: implement msgr2.1 protocol (crc and secure modes)") Signed-off-by: Ilya Dryomov Reviewed-by: Viacheslav Dubeyko --- net/ceph/messenger.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d1b5705dc0c6..9f6d860411cb 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1524,7 +1524,7 @@ static void con_fault_finish(struct ceph_connection *con) * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->v1.auth_retry) { + if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) { dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); @@ -1714,9 +1714,10 @@ static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ if (con->state == CEPH_CON_S_STANDBY) { - dout("clear_standby %p and ++connect_seq\n", con); + dout("clear_standby %p\n", con); con->state = CEPH_CON_S_PREOPEN; - con->v1.connect_seq++; + if (!ceph_msgr2(from_msgr(con->msgr))) + con->v1.connect_seq++; WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } From e25ddfb388c8b7e5f20e3bf38d627fb485003781 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 10 Sep 2025 20:57:39 +0800 Subject: [PATCH 261/450] bpf: Reject bpf_timer for PREEMPT_RT When enable CONFIG_PREEMPT_RT, the kernel will warn when run timer selftests by './test_progs -t timer': BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 In order to avoid such warning, reject bpf_timer in verifier when PREEMPT_RT is enabled. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20250910125740.52172-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c89e2b1bc644..9fb1f957a093 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8547,6 +8547,10 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); + return -EOPNOTSUPP; + } meta->map_uid = reg->map_uid; meta->map_ptr = map; return 0; From fbdd61c94bcb09b0c0eb0655917bf4193d07aac1 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 10 Sep 2025 20:57:40 +0800 Subject: [PATCH 262/450] selftests/bpf: Skip timer cases when bpf_timer is not supported When enable CONFIG_PREEMPT_RT, verifier will reject bpf_timer with returning -EOPNOTSUPP. Therefore, skip test cases when errno is EOPNOTSUPP. cd tools/testing/selftests/bpf ./test_progs -t timer 125 free_timer:SKIP 456 timer:SKIP 457/1 timer_crash/array:SKIP 457/2 timer_crash/hash:SKIP 457 timer_crash:SKIP 458 timer_lockup:SKIP 459 timer_mim:SKIP Summary: 5/0 PASSED, 6 SKIPPED, 0 FAILED Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20250910125740.52172-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/free_timer.c | 4 ++++ tools/testing/selftests/bpf/prog_tests/timer.c | 4 ++++ tools/testing/selftests/bpf/prog_tests/timer_crash.c | 4 ++++ tools/testing/selftests/bpf/prog_tests/timer_lockup.c | 4 ++++ tools/testing/selftests/bpf/prog_tests/timer_mim.c | 4 ++++ 5 files changed, 20 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/free_timer.c b/tools/testing/selftests/bpf/prog_tests/free_timer.c index b7b77a6b2979..0de8facca4c5 100644 --- a/tools/testing/selftests/bpf/prog_tests/free_timer.c +++ b/tools/testing/selftests/bpf/prog_tests/free_timer.c @@ -124,6 +124,10 @@ void test_free_timer(void) int err; skel = free_timer__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "open_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index d66687f1ee6a..56f660ca567b 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -86,6 +86,10 @@ void serial_test_timer(void) int err; timer_skel = timer__open_and_load(); + if (!timer_skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/timer_crash.c b/tools/testing/selftests/bpf/prog_tests/timer_crash.c index f74b82305da8..b841597c8a3a 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_crash.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_crash.c @@ -12,6 +12,10 @@ static void test_timer_crash_mode(int mode) struct timer_crash *skel; skel = timer_crash__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "timer_crash__open_and_load")) return; skel->bss->pid = getpid(); diff --git a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c index 1a2f99596916..eb303fa1e09a 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c @@ -59,6 +59,10 @@ void test_timer_lockup(void) } skel = timer_lockup__open_and_load(); + if (!skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load")) return; diff --git a/tools/testing/selftests/bpf/prog_tests/timer_mim.c b/tools/testing/selftests/bpf/prog_tests/timer_mim.c index 9ff7843909e7..c930c7d7105b 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer_mim.c +++ b/tools/testing/selftests/bpf/prog_tests/timer_mim.c @@ -65,6 +65,10 @@ void serial_test_timer_mim(void) goto cleanup; timer_skel = timer_mim__open_and_load(); + if (!timer_skel && errno == EOPNOTSUPP) { + test__skip(); + return; + } if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) goto cleanup; From 6fef6ae764be8a77f61ad3b6937ba82fe8358045 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sun, 7 Sep 2025 21:43:20 +0100 Subject: [PATCH 263/450] net: ethtool: fix wrong type used in struct kernel_ethtool_ts_info In C, enumerated types do not have a defined size, apart from being compatible with one of the standard types. This allows an ABI / compiler to choose the type of an enum depending on the values it needs to store, and storing larger values in it can lead to undefined behaviour. The tx_type and rx_filters members of struct kernel_ethtool_ts_info are defined as enumerated types, but are bit arrays, where each bit is defined by the enumerated type. This means they typically store values in excess of the maximum value of the enumerated type, in fact (1 << max_value) and thus must not be declared using the enumated type. Fix both of these to use u32, as per the corresponding __u32 UAPI type. Fixes: 2111375b85ad ("net: Add struct kernel_ethtool_ts_info") Signed-off-by: Russell King (Oracle) Reviewed-by: Kory Maincent Link: https://patch.msgid.link/E1uvMEK-00000003Amd-2pWR@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index de5bd76a400c..d7d757e72554 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -856,8 +856,8 @@ struct kernel_ethtool_ts_info { enum hwtstamp_provider_qualifier phc_qualifier; enum hwtstamp_source phc_source; int phc_phyindex; - enum hwtstamp_tx_types tx_types; - enum hwtstamp_rx_filters rx_filters; + u32 tx_types; + u32 rx_filters; }; /** From 5537a4679403423e0b49c95b619983a4583d69c5 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 8 Sep 2025 13:26:19 +0200 Subject: [PATCH 264/450] net: usb: asix: ax88772: drop phylink use in PM to avoid MDIO runtime PM wakeups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop phylink_{suspend,resume}() from ax88772 PM callbacks. MDIO bus accesses have their own runtime-PM handling and will try to wake the device if it is suspended. Such wake attempts must not happen from PM callbacks while the device PM lock is held. Since phylink {sus|re}sume may trigger MDIO, it must not be called in PM context. No extra phylink PM handling is required for this driver: - .ndo_open/.ndo_stop control the phylink start/stop lifecycle. - ethtool/phylib entry points run in process context, not PM. - phylink MAC ops program the MAC on link changes after resume. Fixes: e0bffe3e6894 ("net: asix: ax88772: migrate to phylink") Reported-by: Hubert Wiśniewski Cc: stable@vger.kernel.org Signed-off-by: Oleksij Rempel Tested-by: Hubert Wiśniewski Tested-by: Xu Yang Link: https://patch.msgid.link/20250908112619.2900723-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/asix_devices.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index 792ddda1ad49..1e8f7089f5e8 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -607,15 +607,8 @@ static const struct net_device_ops ax88772_netdev_ops = { static void ax88772_suspend(struct usbnet *dev) { - struct asix_common_private *priv = dev->driver_priv; u16 medium; - if (netif_running(dev->net)) { - rtnl_lock(); - phylink_suspend(priv->phylink, false); - rtnl_unlock(); - } - /* Stop MAC operation */ medium = asix_read_medium_status(dev, 1); medium &= ~AX_MEDIUM_RE; @@ -644,12 +637,6 @@ static void ax88772_resume(struct usbnet *dev) for (i = 0; i < 3; i++) if (!priv->reset(dev, 1)) break; - - if (netif_running(dev->net)) { - rtnl_lock(); - phylink_resume(priv->phylink); - rtnl_unlock(); - } } static int asix_resume(struct usb_interface *intf) From c3f8d13357deab1e04f8a52b499d6b9b704e578e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Sep 2025 15:11:21 +0200 Subject: [PATCH 265/450] wifi: nl80211: completely disable per-link stats for now After commit 8cc71fc3b82b ("wifi: cfg80211: Fix "no buffer space available" error in nl80211_get_station() for MLO"), the per-link data is only included in station dumps, where the size limit is somewhat less of an issue. However, it's still an issue, depending on how many links a station has and how much per-link data there is. Thus, for now, disable per-link statistics entirely. A complete fix will need to take this into account, make it opt-in by userspace, and change the dump format to be able to split a single station's data across multiple netlink dump messages, which all together is too much development for a fix. Fixes: 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f2f7424e930c..852573423e52 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7575,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev->netdev, mac_addr, - &sinfo, true) < 0) + &sinfo, false) < 0) goto out; sta_idx++; From 8884c693991333ae065830554b9b0c96590b1bb2 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:31 +0000 Subject: [PATCH 266/450] hsr: use rtnl lock when iterating over ports hsr_for_each_port is called in many places without holding the RCU read lock, this may trigger warnings on debug kernels. Most of the callers are actually hold rtnl lock. So add a new helper hsr_for_each_port_rtnl to allow callers in suitable contexts to iterate ports safely without explicit RCU locking. This patch only fixed the callers that is hold rtnl lock. Other caller issues will be fixed in later patches. Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 18 +++++++++--------- net/hsr/hsr_main.c | 2 +- net/hsr/hsr_main.h | 3 +++ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 88657255fec1..bce7b4061ce0 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -49,7 +49,7 @@ static bool hsr_check_carrier(struct hsr_port *master) ASSERT_RTNL(); - hsr_for_each_port(master->hsr, port) { + hsr_for_each_port_rtnl(master->hsr, port) { if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) { netif_carrier_on(master->dev); return true; @@ -105,7 +105,7 @@ int hsr_get_max_mtu(struct hsr_priv *hsr) struct hsr_port *port; mtu_max = ETH_DATA_LEN; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) mtu_max = min(port->dev->mtu, mtu_max); @@ -139,7 +139,7 @@ static int hsr_dev_open(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -172,7 +172,7 @@ static int hsr_dev_close(struct net_device *dev) struct hsr_priv *hsr; hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -205,7 +205,7 @@ static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, * may become enabled. */ features &= ~NETIF_F_ONE_FOR_ALL; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) features = netdev_increment_features(features, port->dev->features, mask); @@ -484,7 +484,7 @@ static void hsr_set_rx_mode(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -506,7 +506,7 @@ static void hsr_change_rx_flags(struct net_device *dev, int change) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -534,7 +534,7 @@ static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) continue; @@ -580,7 +580,7 @@ static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 192893c3f2ec..ac1eb1db1a52 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -22,7 +22,7 @@ static bool hsr_slave_empty(struct hsr_priv *hsr) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) return false; return true; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 135ec5fce019..33b0d2460c9b 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -224,6 +224,9 @@ struct hsr_priv { #define hsr_for_each_port(hsr, port) \ list_for_each_entry_rcu((port), &(hsr)->ports, port_list) +#define hsr_for_each_port_rtnl(hsr, port) \ + list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held()) + struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt); /* Caller must ensure skb is a valid HSR frame */ From 393c841fe4333cdd856d0ca37b066d72746cfaa6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:32 +0000 Subject: [PATCH 267/450] hsr: use hsr_for_each_port_rtnl in hsr_port_get_hsr hsr_port_get_hsr() iterates over ports using hsr_for_each_port(), but many of its callers do not hold the required RCU lock. Switch to hsr_for_each_port_rtnl(), since most callers already hold the rtnl lock. After review, all callers are covered by either the rtnl lock or the RCU lock, except hsr_dev_xmit(). Fix this by adding an RCU read lock there. Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-3-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 3 +++ net/hsr/hsr_main.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index bce7b4061ce0..702da1f9aaa9 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -226,6 +226,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; + rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (master) { skb->dev = master->dev; @@ -238,6 +239,8 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); } + rcu_read_unlock(); + return NETDEV_TX_OK; } diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index ac1eb1db1a52..bc94b07101d8 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -134,7 +134,7 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type == pt) return port; return NULL; From 847748fc66d08a89135a74e29362a66ba4e3ab15 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:33 +0000 Subject: [PATCH 268/450] hsr: hold rcu and dev lock for hsr_get_port_ndev hsr_get_port_ndev calls hsr_for_each_port, which need to hold rcu lock. On the other hand, before return the port device, we need to hold the device reference to avoid UaF in the caller function. Suggested-by: Paolo Abeni Fixes: 9c10dd8eed74 ("net: hsr: Create and export hsr_get_port_ndev()") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-4-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 20 ++++++++++++++------ net/hsr/hsr_device.c | 7 ++++++- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index dadce6009791..e42d0fdefee1 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -654,7 +654,7 @@ static void icssg_prueth_hsr_fdb_add_del(struct prueth_emac *emac, static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) { - struct net_device *real_dev; + struct net_device *real_dev, *port_dev; struct prueth_emac *emac; u8 vlan_id, i; @@ -663,11 +663,15 @@ static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) if (is_hsr_master(real_dev)) { for (i = HSR_PT_SLAVE_A; i < HSR_PT_INTERLINK; i++) { - emac = netdev_priv(hsr_get_port_ndev(real_dev, i)); - if (!emac) + port_dev = hsr_get_port_ndev(real_dev, i); + emac = netdev_priv(port_dev); + if (!emac) { + dev_put(port_dev); return -EINVAL; + } icssg_prueth_hsr_fdb_add_del(emac, addr, vlan_id, true); + dev_put(port_dev); } } else { emac = netdev_priv(real_dev); @@ -679,7 +683,7 @@ static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) static int icssg_prueth_hsr_del_mcast(struct net_device *ndev, const u8 *addr) { - struct net_device *real_dev; + struct net_device *real_dev, *port_dev; struct prueth_emac *emac; u8 vlan_id, i; @@ -688,11 +692,15 @@ static int icssg_prueth_hsr_del_mcast(struct net_device *ndev, const u8 *addr) if (is_hsr_master(real_dev)) { for (i = HSR_PT_SLAVE_A; i < HSR_PT_INTERLINK; i++) { - emac = netdev_priv(hsr_get_port_ndev(real_dev, i)); - if (!emac) + port_dev = hsr_get_port_ndev(real_dev, i); + emac = netdev_priv(port_dev); + if (!emac) { + dev_put(port_dev); return -EINVAL; + } icssg_prueth_hsr_fdb_add_del(emac, addr, vlan_id, false); + dev_put(port_dev); } } else { emac = netdev_priv(real_dev); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 702da1f9aaa9..fbbc3ccf9df6 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -675,9 +675,14 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev, struct hsr_priv *hsr = netdev_priv(ndev); struct hsr_port *port; + rcu_read_lock(); hsr_for_each_port(hsr, port) - if (port->type == pt) + if (port->type == pt) { + dev_hold(port->dev); + rcu_read_unlock(); return port->dev; + } + rcu_read_unlock(); return NULL; } EXPORT_SYMBOL(hsr_get_port_ndev); From 002ebddd695a53999550e241b71950f1aa0e1ac4 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:20 +0200 Subject: [PATCH 269/450] pmdomain: core: Restore behaviour for disabling unused PM domains Recent changes to genpd prevents those PM domains being powered-on during initialization from being powered-off during the boot sequence. Based upon whether CONFIG_PM_CONFIG_PM_GENERIC_DOMAINS_OF is set of not, genpd relies on the sync_state mechanism or the genpd_power_off_unused() (which is a late_initcall_sync), to understand when it's okay to allow these PM domains to be powered-off. This new behaviour in genpd has lead to problems on different platforms. Let's therefore restore the behavior of genpd_power_off_unused(). Moreover, let's introduce GENPD_FLAG_NO_STAY_ON, to allow genpd OF providers to opt-out from the new behaviour. Link: https://lore.kernel.org/all/20250701114733.636510-1-ulf.hansson@linaro.org/ Reported-by: Geert Uytterhoeven Link: https://lore.kernel.org/all/20250902-rk3576-lockup-regression-v1-1-c4a0c9daeb00@collabora.com/ Reported-by: Nicolas Frattaroli Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Tested-by: Heiko Stuebner Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 20 ++++++++++++++------ include/linux/pm_domain.h | 7 +++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index 0006ab3d0789..61c2277c9ce3 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -187,6 +187,7 @@ static const struct genpd_lock_ops genpd_raw_spin_ops = { #define genpd_is_opp_table_fw(genpd) (genpd->flags & GENPD_FLAG_OPP_TABLE_FW) #define genpd_is_dev_name_fw(genpd) (genpd->flags & GENPD_FLAG_DEV_NAME_FW) #define genpd_is_no_sync_state(genpd) (genpd->flags & GENPD_FLAG_NO_SYNC_STATE) +#define genpd_is_no_stay_on(genpd) (genpd->flags & GENPD_FLAG_NO_STAY_ON) static inline bool irq_safe_dev_in_sleep_domain(struct device *dev, const struct generic_pm_domain *genpd) @@ -1357,7 +1358,6 @@ static int genpd_runtime_resume(struct device *dev) return ret; } -#ifndef CONFIG_PM_GENERIC_DOMAINS_OF static bool pd_ignore_unused; static int __init pd_ignore_unused_setup(char *__unused) { @@ -1382,9 +1382,6 @@ static int __init genpd_power_off_unused(void) mutex_lock(&gpd_list_lock); list_for_each_entry(genpd, &gpd_list, gpd_list_node) { - genpd_lock(genpd); - genpd->stay_on = false; - genpd_unlock(genpd); genpd_queue_power_off_work(genpd); } @@ -1393,7 +1390,6 @@ static int __init genpd_power_off_unused(void) return 0; } late_initcall_sync(genpd_power_off_unused); -#endif #ifdef CONFIG_PM_SLEEP @@ -2367,6 +2363,18 @@ static void genpd_lock_init(struct generic_pm_domain *genpd) } } +#ifdef CONFIG_PM_GENERIC_DOMAINS_OF +static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off) +{ + genpd->stay_on = !genpd_is_no_stay_on(genpd) && !is_off; +} +#else +static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off) +{ + genpd->stay_on = false; +} +#endif + /** * pm_genpd_init - Initialize a generic I/O PM domain object. * @genpd: PM domain object to initialize. @@ -2392,7 +2400,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd, INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); atomic_set(&genpd->sd_count, 0); genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON; - genpd->stay_on = !is_off; + genpd_set_stay_on(genpd, is_off); genpd->sync_state = GENPD_SYNC_STATE_OFF; genpd->device_count = 0; genpd->provider = NULL; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index c84edf217819..f67a2cb7d781 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -115,6 +115,12 @@ struct dev_pm_domain_list { * genpd provider specific way, likely through a * parent device node. This flag makes genpd to * skip its internal support for this. + * + * GENPD_FLAG_NO_STAY_ON: For genpd OF providers a powered-on PM domain at + * initialization is prevented from being + * powered-off until the ->sync_state() callback is + * invoked. This flag informs genpd to allow a + * power-off without waiting for ->sync_state(). */ #define GENPD_FLAG_PM_CLK (1U << 0) #define GENPD_FLAG_IRQ_SAFE (1U << 1) @@ -126,6 +132,7 @@ struct dev_pm_domain_list { #define GENPD_FLAG_OPP_TABLE_FW (1U << 7) #define GENPD_FLAG_DEV_NAME_FW (1U << 8) #define GENPD_FLAG_NO_SYNC_STATE (1U << 9) +#define GENPD_FLAG_NO_STAY_ON (1U << 10) enum gpd_status { GENPD_STATE_ON = 0, /* PM domain is on */ From 2bc12a8199a0f954d003faa4ad8d100a0b19d85a Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:21 +0200 Subject: [PATCH 270/450] pmdomain: rockchip: Fix regulator dependency with GENPD_FLAG_NO_STAY_ON The deferred regulator retrieval for Rockchip PM domains are causing some weird dependencies. More precisely, if the power-domain is powered-on from the HW perspective, its corresponding regulator must not be powered-off via regulator_init_complete(), which is a late_initcall_sync. Even on platforms that don't have the domain-supply regulator specified for the power-domain provider, may suffer from these problems. More precisely, things just happen to work before, because genpd_power_off_unused() (also a late_initcall_sync) managed to power-off the PM domain before regulator_init_complete() powered-off the regulator. Ideally this fragile dependency must be fixed properly for the Rockchip PM domains, but until then, let's fallback to the previous behaviour by using the GENPD_FLAG_NO_STAY_ON flag. Link: https://lore.kernel.org/all/20250902-rk3576-lockup-regression-v1-1-c4a0c9daeb00@collabora.com/ Reported-by: Nicolas Frattaroli Cc: Heiko Stuebner Cc: Sebastian Reichel Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Tested-by: Nicolas Frattaroli Tested-by: Heiko Stuebner Acked-by: Heiko Stuebner Signed-off-by: Ulf Hansson --- drivers/pmdomain/rockchip/pm-domains.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pmdomain/rockchip/pm-domains.c b/drivers/pmdomain/rockchip/pm-domains.c index 242570c505fb..1955c6d453e4 100644 --- a/drivers/pmdomain/rockchip/pm-domains.c +++ b/drivers/pmdomain/rockchip/pm-domains.c @@ -865,7 +865,7 @@ static int rockchip_pm_add_one_domain(struct rockchip_pmu *pmu, pd->genpd.power_on = rockchip_pd_power_on; pd->genpd.attach_dev = rockchip_pd_attach_dev; pd->genpd.detach_dev = rockchip_pd_detach_dev; - pd->genpd.flags = GENPD_FLAG_PM_CLK; + pd->genpd.flags = GENPD_FLAG_PM_CLK | GENPD_FLAG_NO_STAY_ON; if (pd_info->active_wakeup) pd->genpd.flags |= GENPD_FLAG_ACTIVE_WAKEUP; pm_genpd_init(&pd->genpd, NULL, From 36bbaec460db0fb2f6d2641470b4a6f3891a492a Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:22 +0200 Subject: [PATCH 271/450] pmdomain: renesas: rcar-sysc: Don't keep unused PM domains powered-on The recent changes to genpd makes a genpd OF provider that is powered-on at initialization to stay powered-on, until the ->sync_state() callback is invoked for it. This may not happen at all, if we wait for a consumer device to be probed, leading to wasting energy. There are ways to enforce the ->sync_state() callback to be invoked, through sysfs or via the probe-defer-timeout, but none of them in its current form are a good fit for rcar-sysc PM domains. Let's therefore opt-out from this behaviour of genpd for now, by using the GENPD_FLAG_NO_STAY_ON. Link: https://lore.kernel.org/all/20250701114733.636510-1-ulf.hansson@linaro.org/ Reported-by: Geert Uytterhoeven Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/rcar-sysc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pmdomain/renesas/rcar-sysc.c b/drivers/pmdomain/renesas/rcar-sysc.c index 2d4161170c63..d8a8ffcde38d 100644 --- a/drivers/pmdomain/renesas/rcar-sysc.c +++ b/drivers/pmdomain/renesas/rcar-sysc.c @@ -241,6 +241,7 @@ static int __init rcar_sysc_pd_setup(struct rcar_sysc_pd *pd) } } + genpd->flags |= GENPD_FLAG_NO_STAY_ON; genpd->power_off = rcar_sysc_pd_power_off; genpd->power_on = rcar_sysc_pd_power_on; From d929e42deaf7e2d165ac702421d02eeb9d544c70 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:23 +0200 Subject: [PATCH 272/450] pmdomain: renesas: rcar-gen4-sysc: Don't keep unused PM domains powered-on The recent changes to genpd makes a genpd OF provider that is powered-on at initialization to stay powered-on, until the ->sync_state() callback is invoked for it. This may not happen at all, if we wait for a consumer device to be probed, leading to wasting energy. There are ways to enforce the ->sync_state() callback to be invoked, through sysfs or via the probe-defer-timeout, but none of them in its current form are a good fit for rcar-gen4-sysc PM domains. Let's therefore opt-out from this behaviour of genpd for now, by using the GENPD_FLAG_NO_STAY_ON. Link: https://lore.kernel.org/all/20250701114733.636510-1-ulf.hansson@linaro.org/ Reported-by: Geert Uytterhoeven Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/rcar-gen4-sysc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pmdomain/renesas/rcar-gen4-sysc.c b/drivers/pmdomain/renesas/rcar-gen4-sysc.c index 5aa7fa1df8fe..7434bf42d215 100644 --- a/drivers/pmdomain/renesas/rcar-gen4-sysc.c +++ b/drivers/pmdomain/renesas/rcar-gen4-sysc.c @@ -251,6 +251,7 @@ static int __init rcar_gen4_sysc_pd_setup(struct rcar_gen4_sysc_pd *pd) genpd->detach_dev = cpg_mssr_detach_dev; } + genpd->flags |= GENPD_FLAG_NO_STAY_ON; genpd->power_off = rcar_gen4_sysc_pd_power_off; genpd->power_on = rcar_gen4_sysc_pd_power_on; From 303010f4658cb134eb27cee88026fb5d065a48cd Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 9 Sep 2025 13:11:24 +0200 Subject: [PATCH 273/450] pmdomain: renesas: rmobile-sysc: Don't keep unused PM domains powered-on The recent changes to genpd makes a genpd OF provider that is powered-on at initialization to stay powered-on, until the ->sync_state() callback is invoked for it. This may not happen at all, if we wait for a consumer device to be probed, leading to wasting energy. There are ways to enforce the ->sync_state() callback to be invoked, through sysfs or via the probe-defer-timeout, but none of them in its current form are a good fit for rmobile-sysc PM domains. Let's therefore opt-out from this behaviour of genpd for now, by using the GENPD_FLAG_NO_STAY_ON. Link: https://lore.kernel.org/all/20250701114733.636510-1-ulf.hansson@linaro.org/ Reported-by: Geert Uytterhoeven Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/renesas/rmobile-sysc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pmdomain/renesas/rmobile-sysc.c b/drivers/pmdomain/renesas/rmobile-sysc.c index 8eedc9a1d825..a6bf7295e909 100644 --- a/drivers/pmdomain/renesas/rmobile-sysc.c +++ b/drivers/pmdomain/renesas/rmobile-sysc.c @@ -100,7 +100,8 @@ static void rmobile_init_pm_domain(struct rmobile_pm_domain *rmobile_pd) struct generic_pm_domain *genpd = &rmobile_pd->genpd; struct dev_power_governor *gov = rmobile_pd->gov; - genpd->flags |= GENPD_FLAG_PM_CLK | GENPD_FLAG_ACTIVE_WAKEUP; + genpd->flags |= GENPD_FLAG_PM_CLK | GENPD_FLAG_ACTIVE_WAKEUP | + GENPD_FLAG_NO_STAY_ON; genpd->attach_dev = cpg_mstp_attach_dev; genpd->detach_dev = cpg_mstp_detach_dev; From 63a796558bc22ec699e4193d5c75534757ddf2e6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 11 Sep 2025 16:33:31 +0200 Subject: [PATCH 274/450] Revert "net: usb: asix: ax88772: drop phylink use in PM to avoid MDIO runtime PM wakeups" This reverts commit 5537a4679403 ("net: usb: asix: ax88772: drop phylink use in PM to avoid MDIO runtime PM wakeups"), it breaks operation of asix ethernet usb dongle after system suspend-resume cycle. Link: https://lore.kernel.org/all/b5ea8296-f981-445d-a09a-2f389d7f6fdd@samsung.com/ Fixes: 5537a4679403 ("net: usb: asix: ax88772: drop phylink use in PM to avoid MDIO runtime PM wakeups") Reported-by: Marek Szyprowski Acked-by: Jakub Kicinski Link: https://patch.msgid.link/2945b9dbadb8ee1fee058b19554a5cb14f1763c1.1757601118.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- drivers/net/usb/asix_devices.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index 1e8f7089f5e8..792ddda1ad49 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -607,8 +607,15 @@ static const struct net_device_ops ax88772_netdev_ops = { static void ax88772_suspend(struct usbnet *dev) { + struct asix_common_private *priv = dev->driver_priv; u16 medium; + if (netif_running(dev->net)) { + rtnl_lock(); + phylink_suspend(priv->phylink, false); + rtnl_unlock(); + } + /* Stop MAC operation */ medium = asix_read_medium_status(dev, 1); medium &= ~AX_MEDIUM_RE; @@ -637,6 +644,12 @@ static void ax88772_resume(struct usbnet *dev) for (i = 0; i < 3; i++) if (!priv->reset(dev, 1)) break; + + if (netif_running(dev->net)) { + rtnl_lock(); + phylink_resume(priv->phylink); + rtnl_unlock(); + } } static int asix_resume(struct usb_interface *intf) From b0035df56dcd210d735e90ecd16817693f1a2ed9 Mon Sep 17 00:00:00 2001 From: Shenghao Ding Date: Thu, 11 Sep 2025 15:11:31 +0800 Subject: [PATCH 275/450] ALSA: hda/tas2781: Fix a potential race condition that causes a NULL pointer in case no efi.get_variable exsits A a potential race condition reported by one of my customers that leads to a NULL pointer dereference, where the call to efi.get_variable should be guarded with efi_rt_services_supported() to ensure that function exists. Fixes: 4fe238513407 ("ALSA: hda/tas2781: Move and unified the calibrated-data getting function for SPI and I2C into the tas2781_hda lib") Signed-off-by: Shenghao Ding Signed-off-by: Takashi Iwai --- sound/hda/codecs/side-codecs/tas2781_hda.c | 5 +++++ sound/hda/codecs/side-codecs/tas2781_hda_i2c.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/sound/hda/codecs/side-codecs/tas2781_hda.c b/sound/hda/codecs/side-codecs/tas2781_hda.c index 536940c78f00..96e6d82dc69e 100644 --- a/sound/hda/codecs/side-codecs/tas2781_hda.c +++ b/sound/hda/codecs/side-codecs/tas2781_hda.c @@ -193,6 +193,11 @@ int tas2781_save_calibration(struct tas2781_hda *hda) efi_status_t status; int i; + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { + dev_err(p->dev, "%s: NO EFI FOUND!\n", __func__); + return -EINVAL; + } + if (hda->catlog_id < LENOVO) efi_guid = tasdev_fct_efi_guid[hda->catlog_id]; diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c index 45a70fbf6205..b5b7a1e82b75 100644 --- a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c +++ b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c @@ -315,6 +315,11 @@ static int tas2563_save_calibration(struct tas2781_hda *h) unsigned int attr; int ret, i, j, k; + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { + dev_err(p->dev, "%s: NO EFI FOUND!\n", __func__); + return -EINVAL; + } + cd->cali_dat_sz_per_dev = TAS2563_CAL_DATA_SIZE * TASDEV_CALIB_N; /* extra byte for each device is the device number */ From 1fcf686def19064a7b5cfaeb28c1f1a119900a2b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 12 Sep 2025 03:27:11 +0800 Subject: [PATCH 276/450] erofs: fix long xattr name prefix placement Currently, xattr name prefixes are forcibly placed into the packed inode if the fragments feature is enabled, and users have no option to put them in plain form directly on disk. This is inflexible. First, as mentioned above, users should be able to store unwrapped long xattr name prefixes unconditionally (COMPAT_PLAIN_XATTR_PFX). Second, since we now have the new metabox inode to store metadata, it should be used when available instead of the packed inode. Fixes: 414091322c63 ("erofs: implement metadata compression") Signed-off-by: Gao Xiang --- fs/erofs/erofs_fs.h | 8 +++++--- fs/erofs/internal.h | 1 + fs/erofs/xattr.c | 13 ++++++++++--- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 377ee12b8b96..3d5738f80072 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -12,10 +12,12 @@ /* to allow for x86 boot sectors and other oddities. */ #define EROFS_SUPER_OFFSET 1024 -#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 -#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 -#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 +#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 +#define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 #define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008 +#define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010 + /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 4ccc5f0ee8df..9319c66e86c3 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -234,6 +234,7 @@ EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX) +EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX) static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid) { diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index eaa9efd766ee..396536d9a862 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -482,6 +482,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb) erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2; struct erofs_xattr_prefix_item *pfs; int ret = 0, i, len; + bool plain = erofs_sb_has_plain_xattr_pfx(sbi); if (!sbi->xattr_prefix_count) return 0; @@ -490,9 +491,15 @@ int erofs_xattr_prefixes_init(struct super_block *sb) if (!pfs) return -ENOMEM; - if (sbi->packed_inode) - buf.mapping = sbi->packed_inode->i_mapping; - else + if (!plain) { + if (erofs_sb_has_metabox(sbi)) + (void)erofs_init_metabuf(&buf, sb, true); + else if (sbi->packed_inode) + buf.mapping = sbi->packed_inode->i_mapping; + else + plain = true; + } + if (plain) (void)erofs_init_metabuf(&buf, sb, false); for (i = 0; i < sbi->xattr_prefix_count; i++) { From f6d2900f2806d584303db689d9e18f0443610514 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Wed, 10 Sep 2025 11:40:03 +0200 Subject: [PATCH 277/450] MAINTAINERS: Update the DMA Rust entry Update the DMA Rust maintainers entry in the following two aspects: (1) Change Abdiel's entry to 'Reviewer'. (2) Take patches through the driver-core tree. Abdiel won't do any more maintainer work on the DMA (or scatterlist) infrastructure, but he'd like to be kept in the loop, hence change is entry to 'R:'. Analogous to [1], the DMA (and scatterlist) helpers are closely coupled with the core device infrastructure and the device lifecycle, hence take patches through the driver-core tree by default. Cc: Abdiel Janulgue Link: https://lore.kernel.org/r/20250725202840.2251768-1-ojeda@kernel.org [1] Acked-by: Abdiel Janulgue Acked-by: Greg Kroah-Hartman Signed-off-by: Danilo Krummrich --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index fed6cd812d79..281149d9b821 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7239,15 +7239,15 @@ F: include/linux/swiotlb.h F: kernel/dma/ DMA MAPPING HELPERS DEVICE DRIVER API [RUST] -M: Abdiel Janulgue M: Danilo Krummrich +R: Abdiel Janulgue R: Daniel Almeida R: Robin Murphy R: Andreas Hindborg L: rust-for-linux@vger.kernel.org S: Supported W: https://rust-for-linux.com -T: git https://github.com/Rust-for-Linux/linux.git alloc-next +T: git git://git.kernel.org/pub/scm/linux/kernel/git/driver-core/driver-core.git F: rust/helpers/dma.c F: rust/kernel/dma.rs F: samples/rust/rust_dma.rs From 247981eecd3dd6ff51bd0a0223deba8af39c5498 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Wed, 10 Sep 2025 20:37:16 +0000 Subject: [PATCH 278/450] net: Use NAPI_* in test_bit when stopping napi kthread napi_stop_kthread waits for the NAPI_STATE_SCHED_THREADED to be unset before stopping the kthread. But it uses test_bit with the NAPIF_STATE_SCHED_THREADED and that might stop the kthread early before the flag is unset. Use the NAPI_* variant of the NAPI state bits in test_bit instead. Tested: ./tools/testing/selftests/net/nl_netdev.py TAP version 13 1..7 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check ok 3 nl_netdev.page_pool_check ok 4 nl_netdev.napi_list_check ok 5 nl_netdev.dev_set_threaded ok 6 nl_netdev.napi_set_threaded ok 7 nl_netdev.nsim_rxq_reset_down # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 ./tools/testing/selftests/drivers/net/napi_threaded.py TAP version 13 1..2 ok 1 napi_threaded.change_num_queues ok 2 napi_threaded.enable_dev_threaded_disable_napi_threaded # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 689883de94dd ("net: stop napi kthreads when THREADED napi is disabled") Signed-off-by: Samiullah Khawaja Link: https://patch.msgid.link/20250910203716.1016546-1-skhawaja@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 93a25d87b86b..8d49b2198d07 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6965,7 +6965,7 @@ static void napi_stop_kthread(struct napi_struct *napi) * the kthread. */ while (true) { - if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state)) + if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) break; msleep(20); From 5577352b55833d0f4350eb5d62eda2df09e84922 Mon Sep 17 00:00:00 2001 From: Li Tian Date: Wed, 10 Sep 2025 08:37:32 +0800 Subject: [PATCH 279/450] net/mlx5: Not returning mlx5_link_info table when speed is unknown Because mlx5e_link_info and mlx5e_ext_link_info have holes e.g. Azure mlx5 reports PTYS 19. Do not return it unless speed is retrieved successfully. Fixes: 65a5d35571849 ("net/mlx5: Refactor link speed handling with mlx5_link_info struct") Suggested-by: Vitaly Kuznetsov Signed-off-by: Li Tian Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250910003732.5973-1-litian@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 2d7adf7444ba..aa9f2b0a77d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -1170,7 +1170,11 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev, mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, force_legacy); i = find_first_bit(&temp, max_size); - if (i < max_size) + + /* mlx5e_link_info has holes. Check speed + * is not zero as indication of one. + */ + if (i < max_size && table[i].speed) return &table[i]; return NULL; From 2690cb089502b80b905f2abdafd1bf2d54e1abef Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 10 Sep 2025 17:48:25 +0300 Subject: [PATCH 280/450] dpaa2-switch: fix buffer pool seeding for control traffic Starting with commit c50e7475961c ("dpaa2-switch: Fix error checking in dpaa2_switch_seed_bp()"), the probing of a second DPSW object errors out like below. fsl_dpaa2_switch dpsw.1: fsl_mc_driver_probe failed: -12 fsl_dpaa2_switch dpsw.1: probe with driver fsl_dpaa2_switch failed with error -12 The aforementioned commit brought to the surface the fact that seeding buffers into the buffer pool destined for control traffic is not successful and an access violation recoverable error can be seen in the MC firmware log: [E, qbman_rec_isr:391, QBMAN] QBMAN recoverable event 0x1000000 This happens because the driver incorrectly used the ID of the DPBP object instead of the hardware buffer pool ID when trying to release buffers into it. This is because any DPSW object uses two buffer pools, one managed by the Linux driver and destined for control traffic packet buffers and the other one managed by the MC firmware and destined only for offloaded traffic. And since the buffer pool managed by the MC firmware does not have an external facing DPBP equivalent, any subsequent DPBP objects created after the first DPSW will have a DPBP id different to the underlying hardware buffer ID. The issue was not caught earlier because these two numbers can be identical when all DPBP objects are created before the DPSW objects are. This is the case when the DPL file is used to describe the entire DPAA2 object layout and objects are created at boot time and it's also true for the first DPSW being created dynamically using ls-addsw. Fix this by using the buffer pool ID instead of the DPBP id when releasing buffers into the pool. Fixes: 2877e4f7e189 ("staging: dpaa2-switch: setup buffer pool and RX path rings") Signed-off-by: Ioana Ciornei Link: https://patch.msgid.link/20250910144825.2416019-1-ioana.ciornei@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index 4643a3380618..b1e1ad9e4b48 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -2736,7 +2736,7 @@ static int dpaa2_switch_setup_dpbp(struct ethsw_core *ethsw) dev_err(dev, "dpsw_ctrl_if_set_pools() failed\n"); goto err_get_attr; } - ethsw->bpid = dpbp_attrs.id; + ethsw->bpid = dpbp_attrs.bpid; return 0; From 9dfec4a51df9cf0dcc23cb4ac6fc314bf9e999d0 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Wed, 10 Sep 2025 15:58:47 +0800 Subject: [PATCH 281/450] USB: core: remove the move buf action The buffer size of sysfs is fixed at PAGE_SIZE, and the page offset of the buf parameter of sysfs_emit_at() must be 0, there is no need to manually manage the buf pointer offset. Fixes: 711d41ab4a0e ("usb: core: Use sysfs_emit_at() when showing dynamic IDs") Reported-by: syzbot+b6445765657b5855e869@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b6445765657b5855e869 Tested-by: syzbot+b6445765657b5855e869@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Link: https://lore.kernel.org/r/tencent_B32D6D8C9450EBFEEE5ACC2C7B0E6C402D0A@qq.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index c3177034b779..f441958b0ef4 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -119,11 +119,11 @@ ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf) guard(mutex)(&usb_dynids_lock); list_for_each_entry(dynid, &dynids->list, node) if (dynid->id.bInterfaceClass != 0) - count += sysfs_emit_at(&buf[count], count, "%04x %04x %02x\n", + count += sysfs_emit_at(buf, count, "%04x %04x %02x\n", dynid->id.idVendor, dynid->id.idProduct, dynid->id.bInterfaceClass); else - count += sysfs_emit_at(&buf[count], count, "%04x %04x\n", + count += sysfs_emit_at(buf, count, "%04x %04x\n", dynid->id.idVendor, dynid->id.idProduct); return count; } From 8ab2f1c35669bff7d7ed1bb16bf5cc989b3e2e17 Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Tue, 26 Aug 2025 09:58:08 +0200 Subject: [PATCH 282/450] mmc: mvsdio: Fix dma_unmap_sg() nents value The dma_unmap_sg() functions should be called with the same nents as the dma_map_sg(), not the value the map function returned. Fixes: 236caa7cc351 ("mmc: SDIO driver for Marvell SoCs") Signed-off-by: Thomas Fourier Reviewed-by: Linus Walleij Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/mvsdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index a9e6277789ba..79df2fa89a3f 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -292,7 +292,7 @@ static u32 mvsd_finish_data(struct mvsd_host *host, struct mmc_data *data, host->pio_ptr = NULL; host->pio_size = 0; } else { - dma_unmap_sg(mmc_dev(host->mmc), data->sg, host->sg_frags, + dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, mmc_get_dma_dir(data)); } From 7b7e71683b4ccbe0dbd7d434707623327e852f20 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Thu, 11 Sep 2025 10:40:20 +0800 Subject: [PATCH 283/450] mmc: sdhci: Move the code related to setting the clock from sdhci_set_ios_common() into sdhci_set_ios() The sdhci_set_clock() is called in sdhci_set_ios_common() and __sdhci_uhs2_set_ios(). According to Section 3.13.2 "Card Interface Detection Sequence" of the SD Host Controller Standard Specification Version 7.00, the SD clock is supplied after power is supplied, so we only need one in __sdhci_uhs2_set_ios(). Let's move the code related to setting the clock from sdhci_set_ios_common() into sdhci_set_ios() and modify the parameters passed to sdhci_set_clock() in __sdhci_uhs2_set_ios(). Fixes: 10c8298a052b ("mmc: sdhci-uhs2: add set_ios()") Cc: stable@vger.kernel.org # v6.13+ Signed-off-by: Ben Chuang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-uhs2.c | 3 ++- drivers/mmc/host/sdhci.c | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/drivers/mmc/host/sdhci-uhs2.c b/drivers/mmc/host/sdhci-uhs2.c index 0efeb9d0c376..18fb6ee5b96a 100644 --- a/drivers/mmc/host/sdhci-uhs2.c +++ b/drivers/mmc/host/sdhci-uhs2.c @@ -295,7 +295,8 @@ static void __sdhci_uhs2_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) else sdhci_uhs2_set_power(host, ios->power_mode, ios->vdd); - sdhci_set_clock(host, host->clock); + sdhci_set_clock(host, ios->clock); + host->clock = ios->clock; } static int sdhci_uhs2_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 3a17821efa5c..ac7e11f37af7 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2367,23 +2367,6 @@ void sdhci_set_ios_common(struct mmc_host *mmc, struct mmc_ios *ios) (ios->power_mode == MMC_POWER_UP) && !(host->quirks2 & SDHCI_QUIRK2_PRESET_VALUE_BROKEN)) sdhci_enable_preset_value(host, false); - - if (!ios->clock || ios->clock != host->clock) { - host->ops->set_clock(host, ios->clock); - host->clock = ios->clock; - - if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK && - host->clock) { - host->timeout_clk = mmc->actual_clock ? - mmc->actual_clock / 1000 : - host->clock / 1000; - mmc->max_busy_timeout = - host->ops->get_max_timeout_count ? - host->ops->get_max_timeout_count(host) : - 1 << 27; - mmc->max_busy_timeout /= host->timeout_clk; - } - } } EXPORT_SYMBOL_GPL(sdhci_set_ios_common); @@ -2410,6 +2393,23 @@ void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) sdhci_set_ios_common(mmc, ios); + if (!ios->clock || ios->clock != host->clock) { + host->ops->set_clock(host, ios->clock); + host->clock = ios->clock; + + if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK && + host->clock) { + host->timeout_clk = mmc->actual_clock ? + mmc->actual_clock / 1000 : + host->clock / 1000; + mmc->max_busy_timeout = + host->ops->get_max_timeout_count ? + host->ops->get_max_timeout_count(host) : + 1 << 27; + mmc->max_busy_timeout /= host->timeout_clk; + } + } + if (host->ops->set_power) host->ops->set_power(host, ios->power_mode, ios->vdd); else From 09c2b628f6403ad467fc73326a50020590603871 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Thu, 11 Sep 2025 10:41:01 +0800 Subject: [PATCH 284/450] mmc: sdhci-uhs2: Fix calling incorrect sdhci_set_clock() function Fix calling incorrect sdhci_set_clock() in __sdhci_uhs2_set_ios() when the vendor defines its own sdhci_set_clock(). Fixes: 10c8298a052b ("mmc: sdhci-uhs2: add set_ios()") Cc: stable@vger.kernel.org # v6.13+ Signed-off-by: Ben Chuang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-uhs2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-uhs2.c b/drivers/mmc/host/sdhci-uhs2.c index 18fb6ee5b96a..c459a08d01da 100644 --- a/drivers/mmc/host/sdhci-uhs2.c +++ b/drivers/mmc/host/sdhci-uhs2.c @@ -295,7 +295,7 @@ static void __sdhci_uhs2_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) else sdhci_uhs2_set_power(host, ios->power_mode, ios->vdd); - sdhci_set_clock(host, ios->clock); + host->ops->set_clock(host, ios->clock); host->clock = ios->clock; } From 77a436c93d10d68201bfd4941d1ca3230dfd1f40 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Thu, 11 Sep 2025 10:42:42 +0800 Subject: [PATCH 285/450] mmc: sdhci-pci-gli: GL9767: Fix initializing the UHS-II interface during a power-on According to the power structure of IC hardware design for UHS-II interface, reset control and timing must be added to the initialization process of powering on the UHS-II interface. Fixes: 27dd3b82557a ("mmc: sdhci-pci-gli: enable UHS-II mode for GL9767") Cc: stable@vger.kernel.org # v6.13+ Signed-off-by: Ben Chuang Acked-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-gli.c | 68 +++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c index 3a1de477e9af..b0f91cc9e40e 100644 --- a/drivers/mmc/host/sdhci-pci-gli.c +++ b/drivers/mmc/host/sdhci-pci-gli.c @@ -283,6 +283,8 @@ #define PCIE_GLI_9767_UHS2_CTL2_ZC_VALUE 0xb #define PCIE_GLI_9767_UHS2_CTL2_ZC_CTL BIT(6) #define PCIE_GLI_9767_UHS2_CTL2_ZC_CTL_VALUE 0x1 +#define PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN BIT(13) +#define PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE BIT(14) #define GLI_MAX_TUNING_LOOP 40 @@ -1179,6 +1181,65 @@ static void gl9767_set_low_power_negotiation(struct pci_dev *pdev, bool enable) gl9767_vhs_read(pdev); } +static void sdhci_gl9767_uhs2_phy_reset(struct sdhci_host *host, bool assert) +{ + struct sdhci_pci_slot *slot = sdhci_priv(host); + struct pci_dev *pdev = slot->chip->pdev; + u32 value, set, clr; + + if (assert) { + /* Assert reset, set RESETN and clean RESETN_VALUE */ + set = PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN; + clr = PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE; + } else { + /* De-assert reset, clean RESETN and set RESETN_VALUE */ + set = PCIE_GLI_9767_UHS2_CTL2_FORCE_RESETN_VALUE; + clr = PCIE_GLI_9767_UHS2_CTL2_FORCE_PHY_RESETN; + } + + gl9767_vhs_write(pdev); + pci_read_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, &value); + value |= set; + pci_write_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, value); + value &= ~clr; + pci_write_config_dword(pdev, PCIE_GLI_9767_UHS2_CTL2, value); + gl9767_vhs_read(pdev); +} + +static void __gl9767_uhs2_set_power(struct sdhci_host *host, unsigned char mode, unsigned short vdd) +{ + u8 pwr = 0; + + if (mode != MMC_POWER_OFF) { + pwr = sdhci_get_vdd_value(vdd); + if (!pwr) + WARN(1, "%s: Invalid vdd %#x\n", + mmc_hostname(host->mmc), vdd); + pwr |= SDHCI_VDD2_POWER_180; + } + + if (host->pwr == pwr) + return; + + host->pwr = pwr; + + if (pwr == 0) { + sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); + } else { + sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); + + pwr |= SDHCI_POWER_ON; + sdhci_writeb(host, pwr & 0xf, SDHCI_POWER_CONTROL); + usleep_range(5000, 6250); + + /* Assert reset */ + sdhci_gl9767_uhs2_phy_reset(host, true); + pwr |= SDHCI_VDD2_POWER_ON; + sdhci_writeb(host, pwr, SDHCI_POWER_CONTROL); + usleep_range(5000, 6250); + } +} + static void sdhci_gl9767_set_clock(struct sdhci_host *host, unsigned int clock) { struct sdhci_pci_slot *slot = sdhci_priv(host); @@ -1205,6 +1266,11 @@ static void sdhci_gl9767_set_clock(struct sdhci_host *host, unsigned int clock) } sdhci_enable_clk(host, clk); + + if (mmc_card_uhs2(host->mmc)) + /* De-assert reset */ + sdhci_gl9767_uhs2_phy_reset(host, false); + gl9767_set_low_power_negotiation(pdev, true); } @@ -1476,7 +1542,7 @@ static void sdhci_gl9767_set_power(struct sdhci_host *host, unsigned char mode, gl9767_vhs_read(pdev); sdhci_gli_overcurrent_event_enable(host, false); - sdhci_uhs2_set_power(host, mode, vdd); + __gl9767_uhs2_set_power(host, mode, vdd); sdhci_gli_overcurrent_event_enable(host, true); } else { gl9767_vhs_write(pdev); From 201825fb4278accb6ace42915566c22391a0900d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 11 Sep 2025 15:43:15 +0100 Subject: [PATCH 286/450] net: ethtool: handle EOPNOTSUPP from ethtool get_ts_info() method Network drivers sometimes return -EOPNOTSUPP from their get_ts_info() method, and this should not cause the reporting of PHY timestamping information to be prohibited. Handle this error code, and also arrange for ethtool_net_get_ts_info_by_phc() to return -EOPNOTSUPP when the method is not implemented. This allows e.g. PHYs connected to DSA switches which support timestamping to report their timestamping capabilities. Fixes: b9e3f7dc9ed9 ("net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology") Signed-off-by: Russell King (Oracle) Reviewed-by: Kory Maincent Link: https://patch.msgid.link/E1uwiW3-00000004jRF-3CnC@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- net/ethtool/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 4f58648a27ad..92e6a681c797 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -905,7 +905,7 @@ int ethtool_net_get_ts_info_by_phc(struct net_device *dev, int err; if (!ops->get_ts_info) - return -ENODEV; + return -EOPNOTSUPP; /* Does ptp comes from netdev */ ethtool_init_tsinfo(info); @@ -973,7 +973,7 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev, int err; err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc); - if (err == -ENODEV) { + if (err == -ENODEV || err == -EOPNOTSUPP) { struct phy_device *phy; phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); From a5edf3550f4260504b7e0ab3d40d13ffe924b773 Mon Sep 17 00:00:00 2001 From: hupu Date: Wed, 10 Sep 2025 16:16:55 +0800 Subject: [PATCH 287/450] perf subcmd: avoid crash in exclude_cmds when excludes is empty When cross-compiling the perf tool for ARM64, `perf help` may crash with the following assertion failure: help.c:122: exclude_cmds: Assertion `cmds->names[ci] == NULL' failed. This happens when the perf binary is not named exactly "perf" or when multiple "perf-*" binaries exist in the same directory. In such cases, the `excludes` command list can be empty, which leads to the final assertion in exclude_cmds() being triggered. Add a simple guard at the beginning of exclude_cmds() to return early if excludes->cnt is zero, preventing the crash. Signed-off-by: hupu Reported-by: Guilherme Amadio Reviewed-by: Namhyung Kim Link: https://lore.kernel.org/r/20250909094953.106706-1-amadio@gentoo.org Signed-off-by: Namhyung Kim --- tools/lib/subcmd/help.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index 9ef569492560..ddaeb4eb3e24 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -75,6 +75,9 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) size_t ci, cj, ei; int cmp; + if (!excludes->cnt) + return; + ci = cj = ei = 0; while (ci < cmds->cnt && ei < excludes->cnt) { cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); From 7947ad15614ce897f47ce8ae123b82445d1861d0 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 12 Sep 2025 17:01:29 -0700 Subject: [PATCH 288/450] perf lock: Provide a host_env for session new When "perf lock con" is run in a live mode, with no data file, a host environment must be provided. Testing missed this as a failing assert was creating the 1 line of expected stderr output. $ sudo perf lock con -ab true perf: util/session.c:195: __perf_session__new: Assertion `host_env != NULL' failed. Aborted Fixes: 525a599badeeafba ("perf env: Remove global perf_env") Signed-off-by: Ian Rogers Signed-off-by: Namhyung Kim --- tools/perf/builtin-lock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index fd49703021fd..078634461df2 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -2009,6 +2009,7 @@ static int __cmd_contention(int argc, const char **argv) .owner = show_lock_owner, .cgroups = RB_ROOT, }; + struct perf_env host_env; lockhash_table = calloc(LOCKHASH_SIZE, sizeof(*lockhash_table)); if (!lockhash_table) @@ -2024,7 +2025,10 @@ static int __cmd_contention(int argc, const char **argv) eops.mmap = perf_event__process_mmap; eops.tracing_data = perf_event__process_tracing_data; - session = perf_session__new(use_bpf ? NULL : &data, &eops); + perf_env__init(&host_env); + session = __perf_session__new(use_bpf ? NULL : &data, &eops, + /*trace_event_repipe=*/false, &host_env); + if (IS_ERR(session)) { pr_err("Initializing perf session failed\n"); err = PTR_ERR(session); @@ -2142,6 +2146,7 @@ static int __cmd_contention(int argc, const char **argv) evlist__delete(con.evlist); lock_contention_finish(&con); perf_session__delete(session); + perf_env__exit(&host_env); zfree(&lockhash_table); return err; } From 46834d90a9a13549264b9581067d8f746b4b36cc Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sat, 6 Sep 2025 14:21:45 +0200 Subject: [PATCH 289/450] crypto: ccp - Always pass in an error pointer to __sev_platform_shutdown_locked() When 9770b428b1a2 ("crypto: ccp - Move dev_info/err messages for SEV/SNP init and shutdown") moved the error messages dumping so that they don't need to be issued by the callers, it missed the case where __sev_firmware_shutdown() calls __sev_platform_shutdown_locked() with a NULL argument which leads to a NULL ptr deref on the shutdown path, during suspend to disk: #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 0 UID: 0 PID: 983 Comm: hib.sh Not tainted 6.17.0-rc4+ #1 PREEMPT(voluntary) Hardware name: Supermicro Super Server/H12SSL-i, BIOS 2.5 09/08/2022 RIP: 0010:__sev_platform_shutdown_locked.cold+0x0/0x21 [ccp] That rIP is: 00000000000006fd <__sev_platform_shutdown_locked.cold>: 6fd: 8b 13 mov (%rbx),%edx 6ff: 48 8b 7d 00 mov 0x0(%rbp),%rdi 703: 89 c1 mov %eax,%ecx Code: 74 05 31 ff 41 89 3f 49 8b 3e 89 ea 48 c7 c6 a0 8e 54 a0 41 bf 92 ff ff ff e8 e5 2e 09 e1 c6 05 2a d4 38 00 01 e9 26 af ff ff <8b> 13 48 8b 7d 00 89 c1 48 c7 c6 18 90 54 a0 89 44 24 04 e8 c1 2e RSP: 0018:ffffc90005467d00 EFLAGS: 00010282 RAX: 00000000ffffff92 RBX: 0000000000000000 RCX: 0000000000000000 ^^^^^^^^^^^^^^^^ and %rbx is nice and clean. Call Trace: __sev_firmware_shutdown.isra.0 sev_dev_destroy psp_dev_destroy sp_destroy pci_device_shutdown device_shutdown kernel_power_off hibernate.cold state_store kernfs_fop_write_iter vfs_write ksys_write do_syscall_64 entry_SYSCALL_64_after_hwframe Pass in a pointer to the function-local error var in the caller. With that addressed, suspending the ccp shows the error properly at least: ccp 0000:47:00.1: sev command 0x2 timed out, disabling PSP ccp 0000:47:00.1: SEV: failed to SHUTDOWN error 0x0, rc -110 SEV-SNP: Leaking PFN range 0x146800-0x146a00 SEV-SNP: PFN 0x146800 unassigned, dumping non-zero entries in 2M PFN region: [0x146800 - 0x146a00] ... ccp 0000:47:00.1: SEV-SNP firmware shutdown failed, rc -16, error 0x0 ACPI: PM: Preparing to enter system sleep state S5 kvm: exiting hardware virtualization reboot: Power down Btw, this driver is crying to be cleaned up to pass in a proper I/O struct which can be used to store information between the different functions, otherwise stuff like that will happen in the future again. Fixes: 9770b428b1a2 ("crypto: ccp - Move dev_info/err messages for SEV/SNP init and shutdown") Signed-off-by: Borislav Petkov (AMD) Cc: Reviewed-by: Ashish Kalra Acked-by: Tom Lendacky Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index e058ba027792..9f5ccc1720cb 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2430,7 +2430,7 @@ static void __sev_firmware_shutdown(struct sev_device *sev, bool panic) { int error; - __sev_platform_shutdown_locked(NULL); + __sev_platform_shutdown_locked(&error); if (sev_es_tmr) { /* From a0c17ed907ac3326cf3c9d6007ea222a746f5cc2 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 11 Sep 2025 13:14:06 +0000 Subject: [PATCH 290/450] iommu/amd: Fix alias device DTE setting Commit 7bea695ada0 restructured DTE flag handling but inadvertently changed the alias device configuration logic. This may cause incorrect DTE settings for certain devices. Add alias flag check before calling set_dev_entry_from_acpi(). Also move the device iteration loop inside the alias check to restrict execution to cases where alias devices are present. Fixes: 7bea695ada0 ("iommu/amd: Introduce struct ivhd_dte_flags to store persistent DTE flags") Cc: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 6795326a6524..ba9e582a8bbe 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1455,12 +1455,12 @@ static int __init init_iommu_from_acpi(struct amd_iommu *iommu, PCI_FUNC(e->devid)); devid = e->devid; - for (dev_i = devid_start; dev_i <= devid; ++dev_i) { - if (alias) + if (alias) { + for (dev_i = devid_start; dev_i <= devid; ++dev_i) pci_seg->alias_table[dev_i] = devid_to; + set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags); } set_dev_entry_from_acpi_range(iommu, devid_start, devid, flags, ext_flags); - set_dev_entry_from_acpi(iommu, devid_to, flags, ext_flags); break; case IVHD_DEV_SPECIAL: { u8 handle, type; From 07c24945cafc810bcce3ea6c336a6495e813b47f Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 12 Sep 2025 20:06:50 +0700 Subject: [PATCH 291/450] Revert "drm: Add directive to format code in comment" Commit 6cc44e9618f03f ("drm: Add directive to format code in comment") fixes original Sphinx indentation warning as introduced in 471920ce25d50b ("drm/gpuvm: Add locking helpers"), by means of using code-block:: directive. It semantically conflicts with earlier bb324f85f72284 ("drm/gpuvm: Wrap drm_gpuvm_sm_map_exec_lock() expected usage in literal code block") that did the same using double colon syntax instead. These duplicated literal code block directives causes the original warnings not being fixed. Revert 6cc44e9618f03f to keep things rolling without these warnings. Fixes: 6cc44e9618f0 ("drm: Add directive to format code in comment") Fixes: 471920ce25d5 ("drm/gpuvm: Add locking helpers") Signed-off-by: Bagas Sanjaya Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Danilo Krummrich --- drivers/gpu/drm/drm_gpuvm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/drm_gpuvm.c b/drivers/gpu/drm/drm_gpuvm.c index 60b672d3fd83..8b8a3ca8d7fc 100644 --- a/drivers/gpu/drm/drm_gpuvm.c +++ b/drivers/gpu/drm/drm_gpuvm.c @@ -2432,8 +2432,6 @@ static const struct drm_gpuvm_ops lock_ops = { * * The expected usage is: * - * .. code-block:: c - * * vm_bind { * struct drm_exec exec; * From 98c6d259319ecf6e8d027abd3f14b81324b8c0ad Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:15:03 -0700 Subject: [PATCH 292/450] mm/gup: check ref_count instead of lru before migration Patch series "mm: better GUP pin lru_add_drain_all()", v2. Series of lru_add_drain_all()-related patches, arising from recent mm/gup migration report from Will Deacon. This patch (of 5): Will Deacon reports:- When taking a longterm GUP pin via pin_user_pages(), __gup_longterm_locked() tries to migrate target folios that should not be longterm pinned, for example because they reside in a CMA region or movable zone. This is done by first pinning all of the target folios anyway, collecting all of the longterm-unpinnable target folios into a list, dropping the pins that were just taken and finally handing the list off to migrate_pages() for the actual migration. It is critically important that no unexpected references are held on the folios being migrated, otherwise the migration will fail and pin_user_pages() will return -ENOMEM to its caller. Unfortunately, it is relatively easy to observe migration failures when running pKVM (which uses pin_user_pages() on crosvm's virtual address space to resolve stage-2 page faults from the guest) on a 6.15-based Pixel 6 device and this results in the VM terminating prematurely. In the failure case, 'crosvm' has called mlock(MLOCK_ONFAULT) on its mapping of guest memory prior to the pinning. Subsequently, when pin_user_pages() walks the page-table, the relevant 'pte' is not present and so the faulting logic allocates a new folio, mlocks it with mlock_folio() and maps it in the page-table. Since commit 2fbb0c10d1e8 ("mm/munlock: mlock_page() munlock_page() batch by pagevec"), mlock/munlock operations on a folio (formerly page), are deferred. For example, mlock_folio() takes an additional reference on the target folio before placing it into a per-cpu 'folio_batch' for later processing by mlock_folio_batch(), which drops the refcount once the operation is complete. Processing of the batches is coupled with the LRU batch logic and can be forcefully drained with lru_add_drain_all() but as long as a folio remains unprocessed on the batch, its refcount will be elevated. This deferred batching therefore interacts poorly with the pKVM pinning scenario as we can find ourselves in a situation where the migration code fails to migrate a folio due to the elevated refcount from the pending mlock operation. Hugh Dickins adds:- !folio_test_lru() has never been a very reliable way to tell if an lru_add_drain_all() is worth calling, to remove LRU cache references to make the folio migratable: the LRU flag may be set even while the folio is held with an extra reference in a per-CPU LRU cache. 5.18 commit 2fbb0c10d1e8 may have made it more unreliable. Then 6.11 commit 33dfe9204f29 ("mm/gup: clear the LRU flag of a page before adding to LRU batch") tried to make it reliable, by moving LRU flag clearing; but missed the mlock/munlock batches, so still unreliable as reported. And it turns out to be difficult to extend 33dfe9204f29's LRU flag clearing to the mlock/munlock batches: if they do benefit from batching, mlock/munlock cannot be so effective when easily suppressed while !LRU. Instead, switch to an expected ref_count check, which was more reliable all along: some more false positives (unhelpful drains) than before, and never a guarantee that the folio will prove migratable, but better. Note on PG_private_2: ceph and nfs are still using the deprecated PG_private_2 flag, with the aid of netfs and filemap support functions. Although it is consistently matched by an increment of folio ref_count, folio_expected_ref_count() intentionally does not recognize it, and ceph folio migration currently depends on that for PG_private_2 folios to be rejected. New references to the deprecated flag are discouraged, so do not add it into the collect_longterm_unpinnable_folios() calculation: but longterm pinning of transiently PG_private_2 ceph and nfs folios (an uncommon case) may invoke a redundant lru_add_drain_all(). And this makes easy the backport to earlier releases: up to and including 6.12, btrfs also used PG_private_2, but without a ref_count increment. Note for stable backports: requires 6.16 commit 86ebd50224c0 ("mm: add folio_expected_ref_count() for reference count calculation"). Link: https://lkml.kernel.org/r/41395944-b0e3-c3ac-d648-8ddd70451d28@google.com Link: https://lkml.kernel.org/r/bd1f314a-fca1-8f19-cac0-b936c9614557@google.com Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Signed-off-by: Hugh Dickins Reported-by: Will Deacon Closes: https://lore.kernel.org/linux-mm/20250815101858.24352-1-will@kernel.org/ Acked-by: Kiryl Shutsemau Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index adffe663594d..82aec6443c0a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2307,7 +2307,8 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (!folio_test_lru(folio) && drain_allow) { + if (drain_allow && folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); drain_allow = false; } From a09a8a1fbb374e0053b97306da9dbc05bd384685 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:16:53 -0700 Subject: [PATCH 293/450] mm/gup: local lru_add_drain() to avoid lru_add_drain_all() In many cases, if collect_longterm_unpinnable_folios() does need to drain the LRU cache to release a reference, the cache in question is on this same CPU, and much more efficiently drained by a preliminary local lru_add_drain(), than the later cross-CPU lru_add_drain_all(). Marked for stable, to counter the increase in lru_add_drain_all()s from "mm/gup: check ref_count instead of lru before migration". Note for clean backports: can take 6.16 commit a03db236aebf ("gup: optimize longterm pin_user_pages() for large folio") first. Link: https://lkml.kernel.org/r/66f2751f-283e-816d-9530-765db7edc465@google.com Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 82aec6443c0a..b47066a54f52 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2287,8 +2287,8 @@ static unsigned long collect_longterm_unpinnable_folios( struct pages_or_folios *pofs) { unsigned long collected = 0; - bool drain_allow = true; struct folio *folio; + int drained = 0; long i = 0; for (folio = pofs_get_folio(pofs, i); folio; @@ -2307,10 +2307,17 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (drain_allow && folio_ref_count(folio) != - folio_expected_ref_count(folio) + 1) { + if (drained == 0 && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { + lru_add_drain(); + drained = 1; + } + if (drained == 1 && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); - drain_allow = false; + drained = 2; } if (!folio_isolate_lru(folio)) From afb99e9f500485160f34b8cad6d3763ada3e80e8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:19:17 -0700 Subject: [PATCH 294/450] mm: revert "mm/gup: clear the LRU flag of a page before adding to LRU batch" This reverts commit 33dfe9204f29: now that collect_longterm_unpinnable_folios() is checking ref_count instead of lru, and mlock/munlock do not participate in the revised LRU flag clearing, those changes are misleading, and enlarge the window during which mlock/munlock may miss an mlock_count update. It is possible (I'd hesitate to claim probable) that the greater likelihood of missed mlock_count updates would explain the "Realtime threads delayed due to kcompactd0" observed on 6.12 in the Link below. If that is the case, this reversion will help; but a complete solution needs also a further patch, beyond the scope of this series. Included some 80-column cleanup around folio_batch_add_and_move(). The role of folio_test_clear_lru() (before taking per-memcg lru_lock) is questionable since 6.13 removed mem_cgroup_move_account() etc; but perhaps there are still some races which need it - not examined here. Link: https://lore.kernel.org/linux-mm/DU0PR01MB10385345F7153F334100981888259A@DU0PR01MB10385.eurprd01.prod.exchangelabs.com/ Link: https://lkml.kernel.org/r/05905d7b-ed14-68b1-79d8-bdec30367eba@google.com Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/swap.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 3632dd061beb..6ae2d5680574 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -164,6 +164,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; + /* block memcg migration while the folio moves between lru */ + if (move_fn != lru_add && !folio_test_clear_lru(folio)) + continue; + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); @@ -176,14 +180,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, - struct folio *folio, move_fn_t move_fn, - bool on_lru, bool disable_irq) + struct folio *folio, move_fn_t move_fn, bool disable_irq) { unsigned long flags; - if (on_lru && !folio_test_clear_lru(folio)) - return; - folio_get(folio); if (disable_irq) @@ -191,8 +191,8 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, else local_lock(&cpu_fbatches.lock); - if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || - lru_cache_disabled()) + if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || + folio_test_large(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) @@ -201,13 +201,13 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, local_unlock(&cpu_fbatches.lock); } -#define folio_batch_add_and_move(folio, op, on_lru) \ - __folio_batch_add_and_move( \ - &cpu_fbatches.op, \ - folio, \ - op, \ - on_lru, \ - offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \ +#define folio_batch_add_and_move(folio, op) \ + __folio_batch_add_and_move( \ + &cpu_fbatches.op, \ + folio, \ + op, \ + offsetof(struct cpu_fbatches, op) >= \ + offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) @@ -231,10 +231,10 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || - folio_test_unevictable(folio)) + folio_test_unevictable(folio) || !folio_test_lru(folio)) return; - folio_batch_add_and_move(folio, lru_move_tail, true); + folio_batch_add_and_move(folio, lru_move_tail); } void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, @@ -328,10 +328,11 @@ static void folio_activate_drain(int cpu) void folio_activate(struct folio *folio) { - if (folio_test_active(folio) || folio_test_unevictable(folio)) + if (folio_test_active(folio) || folio_test_unevictable(folio) || + !folio_test_lru(folio)) return; - folio_batch_add_and_move(folio, lru_activate, true); + folio_batch_add_and_move(folio, lru_activate); } #else @@ -507,7 +508,7 @@ void folio_add_lru(struct folio *folio) lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); - folio_batch_add_and_move(folio, lru_add, false); + folio_batch_add_and_move(folio, lru_add); } EXPORT_SYMBOL(folio_add_lru); @@ -685,13 +686,13 @@ void lru_add_drain_cpu(int cpu) void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; - folio_batch_add_and_move(folio, lru_deactivate_file, true); + folio_batch_add_and_move(folio, lru_deactivate_file); } /* @@ -704,13 +705,13 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; - folio_batch_add_and_move(folio, lru_deactivate, true); + folio_batch_add_and_move(folio, lru_deactivate); } /** @@ -723,10 +724,11 @@ void folio_deactivate(struct folio *folio) void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || + !folio_test_lru(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; - folio_batch_add_and_move(folio, lru_lazyfree, true); + folio_batch_add_and_move(folio, lru_lazyfree); } void lru_add_drain(void) From 8d79ed36bfc83d0583ab72216b7980340478cdfb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:21:12 -0700 Subject: [PATCH 295/450] mm: revert "mm: vmscan.c: fix OOM on swap stress test" This reverts commit 0885ef470560: that was a fix to the reverted 33dfe9204f29b415bbc0abb1a50642d1ba94f5e9. Link: https://lkml.kernel.org/r/aa0e9d67-fbcd-9d79-88a1-641dfbe1d9d1@google.com Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a48aec8bfd92..674999999cd0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4507,7 +4507,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (!folio_test_lru(folio) || zone > sc->reclaim_idx) { + if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; From 2da6de30e60dd9bb14600eff1cc99df2fa2ddae3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:23:15 -0700 Subject: [PATCH 296/450] mm: folio_may_be_lru_cached() unless folio_test_large() mm/swap.c and mm/mlock.c agree to drain any per-CPU batch as soon as a large folio is added: so collect_longterm_unpinnable_folios() just wastes effort when calling lru_add_drain[_all]() on a large folio. But although there is good reason not to batch up PMD-sized folios, we might well benefit from batching a small number of low-order mTHPs (though unclear how that "small number" limitation will be implemented). So ask if folio_may_be_lru_cached() rather than !folio_test_large(), to insulate those particular checks from future change. Name preferred to "folio_is_batchable" because large folios can well be put on a batch: it's just the per-CPU LRU caches, drained much later, which need care. Marked for stable, to counter the increase in lru_add_drain_all()s from "mm/gup: check ref_count instead of lru before migration". Link: https://lkml.kernel.org/r/57d2eaf8-3607-f318-e0c5-be02dce61ad0@google.com Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Signed-off-by: Hugh Dickins Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 ++++++++++ mm/gup.c | 4 ++-- mm/mlock.c | 6 +++--- mm/swap.c | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe6ed2cc3fd..7012a0f758d8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -385,6 +385,16 @@ void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); +static inline bool folio_may_be_lru_cached(struct folio *folio) +{ + /* + * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. + * Holding small numbers of low-order mTHP folios in per-CPU LRU cache + * will be sensible, but nobody has implemented and tested that yet. + */ + return !folio_test_large(folio); +} + extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) diff --git a/mm/gup.c b/mm/gup.c index b47066a54f52..0bc4d140fc07 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2307,13 +2307,13 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (drained == 0 && + if (drained == 0 && folio_may_be_lru_cached(folio) && folio_ref_count(folio) != folio_expected_ref_count(folio) + 1) { lru_add_drain(); drained = 1; } - if (drained == 1 && + if (drained == 1 && folio_may_be_lru_cached(folio) && folio_ref_count(folio) != folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); diff --git a/mm/mlock.c b/mm/mlock.c index a1d93ad33c6d..bb0776f5ef7c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio) */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } diff --git a/mm/swap.c b/mm/swap.c index 6ae2d5680574..b74ebe865dd9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -192,7 +192,7 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, local_lock(&cpu_fbatches.lock); if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) From e6a0deb6fa5b0fc134ee2aa127d1cfc9456d8445 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 13:15:12 -0700 Subject: [PATCH 297/450] mm/damon/core: introduce damon_call_control->dealloc_on_cancel Patch series "mm/damon/sysfs: fix refresh_ms control overwriting on multi-kdamonds usages". Automatic esssential DAMON/DAMOS status update feature of DAMON sysfs interface (refresh_ms) is broken [1] for multiple DAMON contexts (kdamonds) use case, since it uses a global single damon_call_control object for all created DAMON contexts. The fields of the object, particularly the list field is over-written for the contexts and it makes unexpected results including user-space hangup and kernel crashes [2]. Fix it by extending damon_call_control for the use case and updating the usage on DAMON sysfs interface to use per-context dynamically allocated damon_call_control object. This patch (of 2): When damon_call_control->repeat is set, damon_call() is executed asynchronously, and is eventually canceled when kdamond finishes. If the damon_call_control object is dynamically allocated, finding the place to deallocate the object is difficult. Introduce a new damon_call_control field, namely dealloc_on_cancel, to ask the kdamond deallocates those dynamically allocated objects when those are canceled. Link: https://lkml.kernel.org/r/20250908201513.60802-3-sj@kernel.org Link: https://lkml.kernel.org/r/20250908201513.60802-2-sj@kernel.org Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work") Signed-off-by: SeongJae Park Cc: Yunjeong Mun Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index f13664c62ddd..9e62b2a85538 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -636,6 +636,7 @@ struct damon_operations { * @data: Data that will be passed to @fn. * @repeat: Repeat invocations. * @return_code: Return code from @fn invocation. + * @dealloc_on_cancel: De-allocate when canceled. * * Control damon_call(), which requests specific kdamond to invoke a given * function. Refer to damon_call() for more details. @@ -645,6 +646,7 @@ struct damon_call_control { void *data; bool repeat; int return_code; + bool dealloc_on_cancel; /* private: internal use only */ /* informs if the kdamond finished handling of the request */ struct completion completion; diff --git a/mm/damon/core.c b/mm/damon/core.c index c2e0b469fd43..08065b363972 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2479,10 +2479,14 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) mutex_lock(&ctx->call_controls_lock); list_del(&control->list); mutex_unlock(&ctx->call_controls_lock); - if (!control->repeat) + if (!control->repeat) { complete(&control->completion); - else + } else if (control->canceled && control->dealloc_on_cancel) { + kfree(control); + continue; + } else { list_add(&control->list, &repeat_controls); + } } control = list_first_entry_or_null(&repeat_controls, struct damon_call_control, list); From 04a06b139ec08aa63d7377f6d3e5218f8ddb1c5d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 13:15:13 -0700 Subject: [PATCH 298/450] mm/damon/sysfs: use dynamically allocated repeat mode damon_call_control DAMON sysfs interface is using a single global repeat mode damon_call_control variable for refresh_ms handling, for all DAMON contexts. As a result, when there are more than one context, the single global damon_call_control is unexpectedly over-written (corrupted). Particularly the ->link field is overwritten by the multiple contexts and this can cause a user hangup, and/or a kernel crash. Fix it by using dynamically allocated damon_call_control object per DAMON context. Link: https://lkml.kernel.org/r/20250908201513.60802-3-sj@kernel.org Link: https://lore.kernel.org/20250904011738.930-1-yunjeong.mun@sk.com [1] Link: https://lore.kernel.org/20250905035411.39501-1-sj@kernel.org [2] Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work") Signed-off-by: SeongJae Park Reported-by: Yunjeong Mun Closes: https://lore.kernel.org/20250904011738.930-1-yunjeong.mun@sk.com Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 7b9254cadd5f..c96c2154128f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1534,14 +1534,10 @@ static int damon_sysfs_repeat_call_fn(void *data) return 0; } -static struct damon_call_control damon_sysfs_repeat_call_control = { - .fn = damon_sysfs_repeat_call_fn, - .repeat = true, -}; - static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) { struct damon_ctx *ctx; + struct damon_call_control *repeat_call_control; int err; if (damon_sysfs_kdamond_running(kdamond)) @@ -1554,18 +1550,29 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) damon_destroy_ctx(kdamond->damon_ctx); kdamond->damon_ctx = NULL; + repeat_call_control = kmalloc(sizeof(*repeat_call_control), + GFP_KERNEL); + if (!repeat_call_control) + return -ENOMEM; + ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); - if (IS_ERR(ctx)) + if (IS_ERR(ctx)) { + kfree(repeat_call_control); return PTR_ERR(ctx); + } err = damon_start(&ctx, 1, false); if (err) { + kfree(repeat_call_control); damon_destroy_ctx(ctx); return err; } kdamond->damon_ctx = ctx; - damon_sysfs_repeat_call_control.data = kdamond; - damon_call(ctx, &damon_sysfs_repeat_call_control); + repeat_call_control->fn = damon_sysfs_repeat_call_fn; + repeat_call_control->data = kdamond; + repeat_call_control->repeat = true; + repeat_call_control->dealloc_on_cancel = true; + damon_call(ctx, repeat_call_control); return err; } From 615cd3705d204680f4ae8d0ad0dec8b778dc2753 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 8 Sep 2025 20:49:59 +0100 Subject: [PATCH 299/450] MAINTAINERS: add Jann Horn as rmap reviewer Jann has been an excellent contributor in all areas of memory management, and has demonstrated great expertise in the reverse mapping. It's therefore appropriate for him to become a reviewer. Link: https://lkml.kernel.org/r/20250908194959.820913-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Harry Yoo Acked-by: SeongJae Park Acked-by: Vlastimil Babka Acked-by: Liam R. Howlett Cc: Jann Horn Cc: Rik van Riel Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index fbdbf7c012a0..b71b8c6813aa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16195,6 +16195,7 @@ R: Rik van Riel R: Liam R. Howlett R: Vlastimil Babka R: Harry Yoo +R: Jann Horn L: linux-mm@kvack.org S: Maintained F: include/linux/rmap.h From 72291a5a0ead8686e3cbfd35baa7d7b1cfdbf6ea Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 8 Sep 2025 18:48:57 +0800 Subject: [PATCH 300/450] MAINTAINERS: add Lance Yang as a THP reviewer I've been actively digging into the MM/THP subsystem for over a year now, and there's a real interest in contributing more and getting further involved. Well, missing out on any more cool THP things is really a pain ;) Link: https://lkml.kernel.org/r/20250908104857.35397-1-lance.yang@linux.dev Signed-off-by: Lance Yang Acked-by: David Hildenbrand Acked-by: Lorenzo Stoakes Acked-by: Zi Yan Acked-by: Barry Song Acked-by: Baolin Wang Cc: Liam R. Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b71b8c6813aa..03b433441836 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16240,6 +16240,7 @@ R: Nico Pache R: Ryan Roberts R: Dev Jain R: Barry Song +R: Lance Yang L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org From f826edeb888c5a8bd1b6e95ae6a50b0db2b21902 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 19:22:36 -0700 Subject: [PATCH 301/450] samples/damon/wsse: avoid starting DAMON before initialization Patch series "samples/damon: fix boot time enable handling fixup merge mistakes". First three patches of the patch series "mm/damon: fix misc bugs in DAMON modules" [1] were trying to fix boot time DAMON sample modules enabling issues. The issues are the modules can crash if those are enabled before DAMON is enabled, like using boot time parameter options. The three patches were fixing the issues by avoiding starting DAMON before the module initialization phase. However, probably by a mistake during a merge, only half of the change is merged, and the part for avoiding the starting of DAMON before the module initialized is missed. So the problem is not solved and thus the modules can still crash if enabled before DAMON is initialized. Fix those by applying the unmerged parts again. Note that the broken commits are merged into 6.17-rc1, but also backported to relevant stable kernels. So this series also needs to be merged into the stable kernels. Hence Cc-ing stable@. This patch (of 3): Commit 0ed1165c3727 ("samples/damon/wsse: fix boot time enable handling") is somehow incompletely applying the origin patch [2]. It is missing the part that avoids starting DAMON before module initialization. Probably a mistake during a merge has happened. Fix it by applying the missed part again. Link: https://lkml.kernel.org/r/20250909022238.2989-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250909022238.2989-2-sj@kernel.org Link: https://lkml.kernel.org/r/20250706193207.39810-1-sj@kernel.org [1] Link: https://lore.kernel.org/20250706193207.39810-2-sj@kernel.org [2] Fixes: 0ed1165c3727 ("samples/damon/wsse: fix boot time enable handling") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- samples/damon/wsse.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/samples/damon/wsse.c b/samples/damon/wsse.c index da052023b099..21eaf15f987d 100644 --- a/samples/damon/wsse.c +++ b/samples/damon/wsse.c @@ -118,6 +118,9 @@ static int damon_sample_wsse_enable_store( return 0; if (enabled) { + if (!init_called) + return 0; + err = damon_sample_wsse_start(); if (err) enabled = false; From e6b733ca2f99e968d696c2e812c8eb8e090bf37b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 19:22:37 -0700 Subject: [PATCH 302/450] samples/damon/prcl: avoid starting DAMON before initialization Commit 2780505ec2b4 ("samples/damon/prcl: fix boot time enable crash") is somehow incompletely applying the origin patch [1]. It is missing the part that avoids starting DAMON before module initialization. Probably a mistake during a merge has happened. Fix it by applying the missed part again. Link: https://lkml.kernel.org/r/20250909022238.2989-3-sj@kernel.org Link: https://lore.kernel.org/20250706193207.39810-3-sj@kernel.org [1] Fixes: 2780505ec2b4 ("samples/damon/prcl: fix boot time enable crash") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- samples/damon/prcl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c index 1b839c06a612..0226652f94d5 100644 --- a/samples/damon/prcl.c +++ b/samples/damon/prcl.c @@ -137,6 +137,9 @@ static int damon_sample_prcl_enable_store( if (enabled == is_enabled) return 0; + if (!init_called) + return 0; + if (enabled) { err = damon_sample_prcl_start(); if (err) From c62cff40481c037307a13becbda795f7afdcfebd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 19:22:38 -0700 Subject: [PATCH 303/450] samples/damon/mtier: avoid starting DAMON before initialization Commit 964314344eab ("samples/damon/mtier: support boot time enable setup") is somehow incompletely applying the origin patch [1]. It is missing the part that avoids starting DAMON before module initialization. Probably a mistake during a merge has happened. Fix it by applying the missed part again. Link: https://lkml.kernel.org/r/20250909022238.2989-4-sj@kernel.org Link: https://lore.kernel.org/20250706193207.39810-4-sj@kernel.org [1] Fixes: 964314344eab ("samples/damon/mtier: support boot time enable setup") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- samples/damon/mtier.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/samples/damon/mtier.c b/samples/damon/mtier.c index 7ebd352138e4..beaf36657dea 100644 --- a/samples/damon/mtier.c +++ b/samples/damon/mtier.c @@ -208,6 +208,9 @@ static int damon_sample_mtier_enable_store( if (enabled == is_enabled) return 0; + if (!init_called) + return 0; + if (enabled) { err = damon_sample_mtier_start(); if (err) From 025e87f8ea2ae3a28bf1fe2b052bfa412c27ed4a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 6 Sep 2025 23:43:34 +0900 Subject: [PATCH 304/450] nilfs2: fix CFI failure when accessing /sys/fs/nilfs2/features/* When accessing one of the files under /sys/fs/nilfs2/features when CONFIG_CFI_CLANG is enabled, there is a CFI violation: CFI failure at kobj_attr_show+0x59/0x80 (target: nilfs_feature_revision_show+0x0/0x30; expected type: 0xfc392c4d) ... Call Trace: sysfs_kf_seq_show+0x2a6/0x390 ? __cfi_kobj_attr_show+0x10/0x10 kernfs_seq_show+0x104/0x15b seq_read_iter+0x580/0xe2b ... When the kobject of the kset for /sys/fs/nilfs2 is initialized, its ktype is set to kset_ktype, which has a ->sysfs_ops of kobj_sysfs_ops. When nilfs_feature_attr_group is added to that kobject via sysfs_create_group(), the kernfs_ops of each files is sysfs_file_kfops_rw, which will call sysfs_kf_seq_show() when ->seq_show() is called. sysfs_kf_seq_show() in turn calls kobj_attr_show() through ->sysfs_ops->show(). kobj_attr_show() casts the provided attribute out to a 'struct kobj_attribute' via container_of() and calls ->show(), resulting in the CFI violation since neither nilfs_feature_revision_show() nor nilfs_feature_README_show() match the prototype of ->show() in 'struct kobj_attribute'. Resolve the CFI violation by adjusting the second parameter in nilfs_feature_{revision,README}_show() from 'struct attribute' to 'struct kobj_attribute' to match the expected prototype. Link: https://lkml.kernel.org/r/20250906144410.22511-1-konishi.ryusuke@gmail.com Fixes: aebe17f68444 ("nilfs2: add /sys/fs/nilfs2/features group") Signed-off-by: Nathan Chancellor Signed-off-by: Ryusuke Konishi Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202509021646.bc78d9ef-lkp@intel.com/ Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/sysfs.c | 4 ++-- fs/nilfs2/sysfs.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 14868a3dd592..bc52afbfc5c7 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) ************************************************************************/ static ssize_t nilfs_feature_revision_show(struct kobject *kobj, - struct attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d.%d\n", NILFS_CURRENT_REV, NILFS_MINOR_REV); @@ -1087,7 +1087,7 @@ static const char features_readme_str[] = "(1) revision\n\tshow current revision of NILFS file system driver.\n"; static ssize_t nilfs_feature_README_show(struct kobject *kobj, - struct attribute *attr, + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, features_readme_str); diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h index 78a87a016928..d370cd5cce3f 100644 --- a/fs/nilfs2/sysfs.h +++ b/fs/nilfs2/sysfs.h @@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups { struct completion sg_segments_kobj_unregister; }; -#define NILFS_COMMON_ATTR_STRUCT(name) \ +#define NILFS_KOBJ_ATTR_STRUCT(name) \ struct nilfs_##name##_attr { \ struct attribute attr; \ - ssize_t (*show)(struct kobject *, struct attribute *, \ + ssize_t (*show)(struct kobject *, struct kobj_attribute *, \ char *); \ - ssize_t (*store)(struct kobject *, struct attribute *, \ + ssize_t (*store)(struct kobject *, struct kobj_attribute *, \ const char *, size_t); \ } -NILFS_COMMON_ATTR_STRUCT(feature); +NILFS_KOBJ_ATTR_STRUCT(feature); #define NILFS_DEV_ATTR_STRUCT(name) \ struct nilfs_##name##_attr { \ From 30989f67650cbf8dc763f7c22e3a210f70a8d7d0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 10 Sep 2025 16:25:27 +0200 Subject: [PATCH 305/450] MAINTAINERS: Input: Drop melfas-mip4 section Emails to the sole melfas-mip4 driver maintainer bounce: 550 No such user here (connected from melfas.com) so clearly this is not a supported driver anymore. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250910142526.105286-2-krzysztof.kozlowski@linaro.org Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 7 ------- 1 file changed, 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ecca2a4f96fd..6d6dd9f1316c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15523,13 +15523,6 @@ S: Supported W: http://www.melexis.com F: drivers/iio/temperature/mlx90635.c -MELFAS MIP4 TOUCHSCREEN DRIVER -M: Sangwon Jee -S: Supported -W: http://www.melfas.com -F: Documentation/devicetree/bindings/input/touchscreen/melfas_mip4.txt -F: drivers/input/touchscreen/melfas_mip4.c - MELLANOX BLUEFIELD I2C DRIVER M: Khalil Blaiech M: Asmaa Mnebhi From ec8f26092e525ee3cb1a56d455109fd40bfff3ef Mon Sep 17 00:00:00 2001 From: Donald Menig Date: Sun, 14 Sep 2025 09:43:33 +0200 Subject: [PATCH 306/450] ALSA: hda/realtek: Add ALC295 Dell TAS2781 I2C fixup This patch adds a new fixup for the ALC295 codec on some Dell laptops that use the TAS2781 I2C amplifier. The fixup correctly initializes the amplifier and pins, allowing sound to work on all speakers of these devices. The fixup chain is added to the relevant quirk entries for Dell Polaris models. [ adjusted for 6.17 kernel code by tiwai ] Fixes: 1e9c708dc3ae ("ALSA: hda/tas2781: Add new quirk for Lenovo, ASUS, Dell projects") Link: https://bugzilla.suse.com/show_bug.cgi?id=1249575 Signed-off-by: Donald Menig Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 85bb8c4d3b17..59cc01079fee 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -3702,6 +3702,7 @@ enum { ALC236_FIXUP_DELL_DUAL_CODECS, ALC287_FIXUP_CS35L41_I2C_2_THINKPAD_ACPI, ALC287_FIXUP_TAS2781_I2C, + ALC295_FIXUP_DELL_TAS2781_I2C, ALC245_FIXUP_TAS2781_SPI_2, ALC287_FIXUP_TXNW2781_I2C, ALC287_FIXUP_YOGA7_14ARB7_I2C, @@ -5167,6 +5168,12 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc294_fixup_gx502_hp, }, + [ALC295_FIXUP_DELL_TAS2781_I2C] = { + .type = HDA_FIXUP_FUNC, + .v.func = tas2781_fixup_tias_i2c, + .chained = true, + .chain_id = ALC289_FIXUP_DUAL_SPK + }, [ALC294_FIXUP_ASUS_GU502_PINS] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -6289,8 +6296,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0c1e, "Dell Precision 3540", ALC236_FIXUP_DELL_DUAL_CODECS), SND_PCI_QUIRK(0x1028, 0x0c28, "Dell Inspiron 16 Plus 7630", ALC295_FIXUP_DELL_INSPIRON_TOP_SPEAKERS), SND_PCI_QUIRK(0x1028, 0x0c4d, "Dell", ALC287_FIXUP_CS35L41_I2C_4), - SND_PCI_QUIRK(0x1028, 0x0c94, "Dell Polaris 3 metal", ALC287_FIXUP_TAS2781_I2C), - SND_PCI_QUIRK(0x1028, 0x0c96, "Dell Polaris 2in1", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x1028, 0x0c94, "Dell Polaris 3 metal", ALC295_FIXUP_DELL_TAS2781_I2C), + SND_PCI_QUIRK(0x1028, 0x0c96, "Dell Polaris 2in1", ALC295_FIXUP_DELL_TAS2781_I2C), SND_PCI_QUIRK(0x1028, 0x0cbd, "Dell Oasis 13 CS MTL-U", ALC289_FIXUP_DELL_CS35L41_SPI_2), SND_PCI_QUIRK(0x1028, 0x0cbe, "Dell Oasis 13 2-IN-1 MTL-U", ALC289_FIXUP_DELL_CS35L41_SPI_2), SND_PCI_QUIRK(0x1028, 0x0cbf, "Dell Oasis 13 Low Weight MTU-L", ALC289_FIXUP_DELL_CS35L41_SPI_2), From 2e7bba08923ebc675b1f0e0e0959e68e53047838 Mon Sep 17 00:00:00 2001 From: Anderson Nascimento Date: Thu, 11 Sep 2025 20:07:44 -0300 Subject: [PATCH 307/450] net/tcp: Fix a NULL pointer dereference when using TCP-AO with TCP_REPAIR A NULL pointer dereference can occur in tcp_ao_finish_connect() during a connect() system call on a socket with a TCP-AO key added and TCP_REPAIR enabled. The function is called with skb being NULL and attempts to dereference it on tcp_hdr(skb)->seq without a prior skb validation. Fix this by checking if skb is NULL before dereferencing it. The commentary is taken from bpf_skops_established(), which is also called in the same flow. Unlike the function being patched, bpf_skops_established() validates the skb before dereferencing it. int main(void){ struct sockaddr_in sockaddr; struct tcp_ao_add tcp_ao; int sk; int one = 1; memset(&sockaddr,'\0',sizeof(sockaddr)); memset(&tcp_ao,'\0',sizeof(tcp_ao)); sk = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); sockaddr.sin_family = AF_INET; memcpy(tcp_ao.alg_name,"cmac(aes128)",12); memcpy(tcp_ao.key,"ABCDEFGHABCDEFGH",16); tcp_ao.keylen = 16; memcpy(&tcp_ao.addr,&sockaddr,sizeof(sockaddr)); setsockopt(sk, IPPROTO_TCP, TCP_AO_ADD_KEY, &tcp_ao, sizeof(tcp_ao)); setsockopt(sk, IPPROTO_TCP, TCP_REPAIR, &one, sizeof(one)); sockaddr.sin_family = AF_INET; sockaddr.sin_port = htobe16(123); inet_aton("127.0.0.1", &sockaddr.sin_addr); connect(sk,(struct sockaddr *)&sockaddr,sizeof(sockaddr)); return 0; } $ gcc tcp-ao-nullptr.c -o tcp-ao-nullptr -Wall $ unshare -Urn BUG: kernel NULL pointer dereference, address: 00000000000000b6 PGD 1f648d067 P4D 1f648d067 PUD 1982e8067 PMD 0 Oops: Oops: 0000 [#1] SMP NOPTI Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 RIP: 0010:tcp_ao_finish_connect (net/ipv4/tcp_ao.c:1182) Fixes: 7c2ffaf21bd6 ("net/tcp: Calculate TCP-AO traffic keys") Signed-off-by: Anderson Nascimento Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911230743.2551-3-anderson@allelesecurity.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ao.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index bbb8d5f0eae7..3338b6cc85c4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1178,7 +1178,9 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) if (!ao) return; - WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); ao->rcv_sne = 0; hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) From 70d99623d5c11e1a9bcc564b8fbad6fa916913d8 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 12 Sep 2025 11:33:31 +0200 Subject: [PATCH 308/450] dpll: fix clock quality level reporting The DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EPRC is not reported via netlink due to bug in dpll_msg_add_clock_quality_level(). The usage of DPLL_CLOCK_QUALITY_LEVEL_MAX for both DECLARE_BITMAP() and for_each_set_bit() is not correct because these macros requires bitmap size and not the highest valid bit in the bitmap. Use correct bitmap size to fix this issue. Fixes: a1afb959add1 ("dpll: add clock quality level attribute and op") Signed-off-by: Ivan Vecera Reviewed-by: Arkadiusz Kubalewski Link: https://patch.msgid.link/20250912093331.862333-1-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- drivers/dpll/dpll_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 036f21cac0a9..0a852011653c 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -211,8 +211,8 @@ static int dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll, struct netlink_ext_ack *extack) { + DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX + 1) = { 0 }; const struct dpll_device_ops *ops = dpll_device_ops(dpll); - DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) = { 0 }; enum dpll_clock_quality_level ql; int ret; @@ -221,7 +221,7 @@ dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll, ret = ops->clock_quality_level_get(dpll, dpll_priv(dpll), qls, extack); if (ret) return ret; - for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) + for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX + 1) if (nla_put_u32(msg, DPLL_A_CLOCK_QUALITY_LEVEL, ql)) return -EMSGSIZE; From 64863f4ca4945bdb62ce2b30823f39ea9fe95415 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 11 Sep 2025 23:58:16 +0100 Subject: [PATCH 309/450] rxrpc: Fix unhandled errors in rxgk_verify_packet_integrity() rxgk_verify_packet_integrity() may get more errors than just -EPROTO from rxgk_verify_mic_skb(). Pretty much anything other than -ENOMEM constitutes an unrecoverable error. In the case of -ENOMEM, we can just drop the packet and wait for a retransmission. Similar happens with rxgk_decrypt_skb() and its callers. Fix rxgk_decrypt_skb() or rxgk_verify_mic_skb() to return a greater variety of abort codes and fix their callers to abort the connection on any error apart from -ENOMEM. Also preclear the variables used to hold the abort code returned from rxgk_decrypt_skb() or rxgk_verify_mic_skb() to eliminate uninitialised variable warnings. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Reported-by: Dan Carpenter Closes: https://lists.infradead.org/pipermail/linux-afs/2025-April/009739.html Closes: https://lists.infradead.org/pipermail/linux-afs/2025-April/009740.html Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Reviewed-by: Simon Horman Link: https://patch.msgid.link/2038804.1757631496@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk.c | 18 ++++++++++-------- net/rxrpc/rxgk_app.c | 10 ++++++---- net/rxrpc/rxgk_common.h | 14 ++++++++++++-- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 1e19c605bcc8..dce5a3d8a964 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -475,7 +475,7 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, struct krb5_buffer metadata; unsigned int offset = sp->offset, len = sp->len; size_t data_offset = 0, data_len = len; - u32 ac; + u32 ac = 0; int ret = -ENOMEM; _enter(""); @@ -499,9 +499,10 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, skb, &offset, &len, &ac); kfree(hdr); - if (ret == -EPROTO) { - rxrpc_abort_eproto(call, skb, ac, - rxgk_abort_1_verify_mic_eproto); + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); } else { sp->offset = offset; sp->len = len; @@ -524,15 +525,16 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, struct rxgk_header hdr; unsigned int offset = sp->offset, len = sp->len; int ret; - u32 ac; + u32 ac = 0; _enter(""); ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); - if (ret == -EPROTO) - rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); - if (ret < 0) + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); goto error; + } if (len < sizeof(hdr)) { ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index b94b77a1c317..df684b5a8531 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -187,7 +187,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, struct key *server_key; unsigned int ticket_offset, ticket_len; u32 kvno, enctype; - int ret, ec; + int ret, ec = 0; struct { __be32 kvno; @@ -236,9 +236,11 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, &ticket_offset, &ticket_len, &ec); crypto_free_aead(token_enc); token_enc = NULL; - if (ret < 0) - return rxrpc_abort_conn(conn, skb, ec, ret, - rxgk_abort_resp_tok_dec); + if (ret < 0) { + if (ret != -ENOMEM) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + } ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, ticket_len, _key); diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index 7370a5655985..80164d89e19c 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -88,11 +88,16 @@ int rxgk_decrypt_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch. */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */ default: + *_error_code = RXGK_INCONSISTENCY; break; } @@ -127,11 +132,16 @@ int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */ default: + *_error_code = RXGK_INCONSISTENCY; break; } From 2429a197648178cd4dc930a9d87c13c547460564 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Sep 2025 00:06:17 +0100 Subject: [PATCH 310/450] rxrpc: Fix untrusted unsigned subtract Fix the following Smatch static checker warning: net/rxrpc/rxgk_app.c:65 rxgk_yfs_decode_ticket() warn: untrusted unsigned subtract. 'ticket_len - 10 * 4' by prechecking the length of what we're trying to extract in two places in the token and decoding for a response packet. Also use sizeof() on the struct we're extracting rather specifying the size numerically to be consistent with the other related statements. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Reported-by: Dan Carpenter Closes: https://lists.infradead.org/pipermail/linux-afs/2025-September/010135.html Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Reviewed-by: Simon Horman Link: https://patch.msgid.link/2039268.1757631977@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk_app.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index df684b5a8531..30275cb5ba3e 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -54,6 +54,10 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, _enter(""); + if (ticket_len < 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + /* Get the session key length */ ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); if (ret < 0) @@ -195,22 +199,23 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, __be32 token_len; } container; + if (token_len < sizeof(container)) + goto short_packet; + /* Decode the RXGK_TokenContainer object. This tells us which server * key we should be using. We can then fetch the key, get the secret * and set up the crypto to extract the token. */ if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + goto short_packet; kvno = ntohl(container.kvno); enctype = ntohl(container.enctype); ticket_len = ntohl(container.token_len); ticket_offset = token_offset + sizeof(container); - if (xdr_round_up(ticket_len) > token_len - 3 * 4) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + if (xdr_round_up(ticket_len) > token_len - sizeof(container)) + goto short_packet; _debug("KVNO %u", kvno); _debug("ENC %u", enctype); @@ -285,4 +290,8 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, * also come out this way if the ticket decryption fails. */ return ret; + +short_packet: + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); } From af82e857df5dd883a4867bcaf5dde041e57a4e33 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Thu, 11 Sep 2025 18:36:10 -0400 Subject: [PATCH 311/450] octeon_ep: Validate the VF ID Add a helper to validate the VF ID and use it in the VF ndo ops to prevent accessing out-of-range entries. Without this check, users can run commands such as: # ip link show dev enp135s0 2: enp135s0: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000 link/ether 00:00:00:01:01:00 brd ff:ff:ff:ff:ff:ff vf 0 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff, spoof checking on, link-state enable, trust off vf 1 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff, spoof checking on, link-state enable, trust off # ip link set dev enp135s0 vf 4 mac 00:00:00:00:00:14 # echo $? 0 even though VF 4 does not exist, which results in silent success instead of returning an error. Fixes: 8a241ef9b9b8 ("octeon_ep: add ndo ops for VFs in PF driver") Signed-off-by: Kamal Heib Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250911223610.1803144-1-kheib@redhat.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeon_ep/octep_main.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index 24499bb36c00..bcea3fc26a8c 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -1124,11 +1124,24 @@ static int octep_set_features(struct net_device *dev, netdev_features_t features return err; } +static bool octep_is_vf_valid(struct octep_device *oct, int vf) +{ + if (vf >= CFG_GET_ACTIVE_VFS(oct->conf)) { + netdev_err(oct->netdev, "Invalid VF ID %d\n", vf); + return false; + } + + return true; +} + static int octep_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi) { struct octep_device *oct = netdev_priv(dev); + if (!octep_is_vf_valid(oct, vf)) + return -EINVAL; + ivi->vf = vf; ether_addr_copy(ivi->mac, oct->vf_info[vf].mac_addr); ivi->spoofchk = true; @@ -1143,6 +1156,9 @@ static int octep_set_vf_mac(struct net_device *dev, int vf, u8 *mac) struct octep_device *oct = netdev_priv(dev); int err; + if (!octep_is_vf_valid(oct, vf)) + return -EINVAL; + if (!is_valid_ether_addr(mac)) { dev_err(&oct->pdev->dev, "Invalid MAC Address %pM\n", mac); return -EADDRNOTAVAIL; From f83ec76bf285bea5727f478a68b894f5543ca76e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Sep 2025 14:21:14 -0700 Subject: [PATCH 312/450] Linux 6.17-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cf37b9407821..9771619ac596 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Baby Opossum Posse # *DOCUMENTATION* From 56c0a2a9ddc2f5b5078c5fb0f81ab76bbc3d4c37 Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Wed, 10 Sep 2025 16:29:16 +1000 Subject: [PATCH 313/450] qed: Don't collect too many protection override GRC elements In the protection override dump path, the firmware can return far too many GRC elements, resulting in attempting to write past the end of the previously-kmalloc'ed dump buffer. This will result in a kernel panic with reason: BUG: unable to handle kernel paging request at ADDRESS where "ADDRESS" is just past the end of the protection override dump buffer. The start address of the buffer is: p_hwfn->cdev->dbg_features[DBG_FEATURE_PROTECTION_OVERRIDE].dump_buf and the size of the buffer is buf_size in the same data structure. The panic can be arrived at from either the qede Ethernet driver path: [exception RIP: qed_grc_dump_addr_range+0x108] qed_protection_override_dump at ffffffffc02662ed [qed] qed_dbg_protection_override_dump at ffffffffc0267792 [qed] qed_dbg_feature at ffffffffc026aa8f [qed] qed_dbg_all_data at ffffffffc026b211 [qed] qed_fw_fatal_reporter_dump at ffffffffc027298a [qed] devlink_health_do_dump at ffffffff82497f61 devlink_health_report at ffffffff8249cf29 qed_report_fatal_error at ffffffffc0272baf [qed] qede_sp_task at ffffffffc045ed32 [qede] process_one_work at ffffffff81d19783 or the qedf storage driver path: [exception RIP: qed_grc_dump_addr_range+0x108] qed_protection_override_dump at ffffffffc068b2ed [qed] qed_dbg_protection_override_dump at ffffffffc068c792 [qed] qed_dbg_feature at ffffffffc068fa8f [qed] qed_dbg_all_data at ffffffffc0690211 [qed] qed_fw_fatal_reporter_dump at ffffffffc069798a [qed] devlink_health_do_dump at ffffffff8aa95e51 devlink_health_report at ffffffff8aa9ae19 qed_report_fatal_error at ffffffffc0697baf [qed] qed_hw_err_notify at ffffffffc06d32d7 [qed] qed_spq_post at ffffffffc06b1011 [qed] qed_fcoe_destroy_conn at ffffffffc06b2e91 [qed] qedf_cleanup_fcport at ffffffffc05e7597 [qedf] qedf_rport_event_handler at ffffffffc05e7bf7 [qedf] fc_rport_work at ffffffffc02da715 [libfc] process_one_work at ffffffff8a319663 Resolve this by clamping the firmware's return value to the maximum number of legal elements the firmware should return. Fixes: d52c89f120de8 ("qed*: Utilize FW 8.37.2.0") Signed-off-by: Jamie Bainbridge Link: https://patch.msgid.link/f8e1182934aa274c18d0682a12dbaf347595469c.1757485536.git.jamie.bainbridge@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 9c3d3dd2f847..1f0cea3cae92 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -4462,10 +4462,11 @@ static enum dbg_status qed_protection_override_dump(struct qed_hwfn *p_hwfn, goto out; } - /* Add override window info to buffer */ + /* Add override window info to buffer, preventing buffer overflow */ override_window_dwords = - qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * - PROTECTION_OVERRIDE_ELEMENT_DWORDS; + min(qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * + PROTECTION_OVERRIDE_ELEMENT_DWORDS, + PROTECTION_OVERRIDE_DEPTH_DWORDS); if (override_window_dwords) { addr = BYTES_TO_DWORDS(GRC_REG_PROTECTION_OVERRIDE_WINDOW); offset += qed_grc_dump_addr_range(p_hwfn, From a9888628cb2c768202a4530e2816da1889cc3165 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Sep 2025 18:54:15 +0200 Subject: [PATCH 314/450] net: dst_metadata: fix IP_DF bit not extracted from tunnel headers Both OVS and TC flower allow extracting and matching on the DF bit of the outer IP header via OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT in the OVS_KEY_ATTR_TUNNEL and TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT in the TCA_FLOWER_KEY_ENC_FLAGS respectively. Flow dissector extracts this information as FLOW_DIS_F_TUNNEL_DONT_FRAGMENT from the tunnel info key. However, the IP_TUNNEL_DONT_FRAGMENT_BIT in the tunnel key is never actually set, because the tunneling code doesn't actually extract it from the IP header. OAM and CRIT_OPT are extracted by the tunnel implementation code, same code also sets the KEY flag, if present. UDP tunnel core takes care of setting the CSUM flag if the checksum is present in the UDP header, but the DONT_FRAGMENT is not handled at any layer. Fix that by checking the bit and setting the corresponding flag while populating the tunnel info in the IP layer where it belongs. Not using __assign_bit as we don't really need to clear the bit in a just initialized field. It also doesn't seem like using __assign_bit will make the code look better. Clearly, users didn't rely on this functionality for anything very important until now. The reason why this doesn't break OVS logic is that it only matches on what kernel previously parsed out and if kernel consistently reports this bit as zero, OVS will only match on it to be zero, which sort of works. But it is still a bug that the uAPI reports and allows matching on the field that is not actually checked in the packet. And this is causing misleading -df reporting in OVS datapath flows, while the tunnel traffic actually has the bit set in most cases. This may also cause issues if a hardware properly implements support for tunnel flag matching as it will disagree with the implementation in a software path of TC flower. Fixes: 7d5437c709de ("openvswitch: Add tunneling interface.") Fixes: 1d17568e74de ("net/sched: cls_flower: add support for matching tunnel control flags") Signed-off-by: Ilya Maximets Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250909165440.229890-2-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/net/dst_metadata.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4160731dcb6e..1fc2fb03ce3f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -3,6 +3,7 @@ #define __NET_DST_METADATA_H 1 #include +#include #include #include #include @@ -220,9 +221,15 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, int md_size) { const struct iphdr *iph = ip_hdr(skb); + struct metadata_dst *tun_dst; - return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, flags, tunnel_id, md_size); + tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, flags, tunnel_id, md_size); + + if (tun_dst && (iph->frag_off & htons(IP_DF))) + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + tun_dst->u.tun_info.key.tun_flags); + return tun_dst; } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, From 6cafb93c1f2aec7875f7a024a53e15f0683df699 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Sep 2025 18:54:16 +0200 Subject: [PATCH 315/450] selftests: openvswitch: add a simple test for tunnel metadata This test ensures that upon receiving decapsulated packets from a tunnel interface in openvswitch, the tunnel metadata fields are properly populated. This partially covers interoperability of the kernel tunnel ports and openvswitch tunnels (LWT) and parsing and formatting of the tunnel metadata fields of the openvswitch netlink uAPI. Doing so, this test also ensures that fields and flags are properly extracted during decapsulation by the tunnel core code, serving as a regression test for the previously fixed issue with the DF bit not being extracted from the outer IP header. The ovs-dpctl.py script already supports all that is necessary for the tunnel ports for this test, so we only need to adjust the ovs_add_if() function to pass the '-t' port type argument in order to be able to create tunnel ports in the openvswitch datapath. Reviewed-by: Aaron Conole Signed-off-by: Ilya Maximets Link: https://patch.msgid.link/20250909165440.229890-3-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- .../selftests/net/openvswitch/openvswitch.sh | 88 +++++++++++++++++-- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index 3c8d3455d8e7..b327d3061ed5 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -25,6 +25,7 @@ tests=" nat_related_v4 ip4-nat-related: ICMP related matches work with SNAT netlink_checks ovsnl: validate netlink attrs and settings upcall_interfaces ovs: test the upcall interfaces + tunnel_metadata ovs: test extraction of tunnel metadata drop_reason drop: test drop reasons are emitted psample psample: Sampling packets with psample" @@ -113,13 +114,13 @@ ovs_add_dp () { } ovs_add_if () { - info "Adding IF to DP: br:$2 if:$3" - if [ "$4" != "-u" ]; then - ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if "$2" "$3" \ - || return 1 + info "Adding IF to DP: br:$3 if:$4 ($2)" + if [ "$5" != "-u" ]; then + ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if \ + -t "$2" "$3" "$4" || return 1 else python3 $ovs_base/ovs-dpctl.py add-if \ - -u "$2" "$3" >$ovs_dir/$3.out 2>$ovs_dir/$3.err & + -u -t "$2" "$3" "$4" >$ovs_dir/$4.out 2>$ovs_dir/$4.err & pid=$! on_exit "ovs_sbx $1 kill -TERM $pid 2>/dev/null" fi @@ -166,9 +167,9 @@ ovs_add_netns_and_veths () { fi if [ "$7" != "-u" ]; then - ovs_add_if "$1" "$2" "$4" || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" || return 1 else - ovs_add_if "$1" "$2" "$4" -u || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" -u || return 1 fi if [ $TRACING -eq 1 ]; then @@ -756,6 +757,79 @@ test_upcall_interfaces() { return 0 } +ovs_add_kernel_tunnel() { + local sbxname=$1; shift + local ns=$1; shift + local tnl_type=$1; shift + local name=$1; shift + local addr=$1; shift + + info "setting up kernel ${tnl_type} tunnel ${name}" + ovs_sbx "${sbxname}" ip -netns ${ns} link add dev ${name} type ${tnl_type} $* || return 1 + on_exit "ovs_sbx ${sbxname} ip -netns ${ns} link del ${name} >/dev/null 2>&1" + ovs_sbx "${sbxname}" ip -netns ${ns} addr add dev ${name} ${addr} || return 1 + ovs_sbx "${sbxname}" ip -netns ${ns} link set dev ${name} mtu 1450 up || return 1 +} + +test_tunnel_metadata() { + which arping >/dev/null 2>&1 || return $ksft_skip + + sbxname="test_tunnel_metadata" + sbx_add "${sbxname}" || return 1 + + info "setting up new DP" + ovs_add_dp "${sbxname}" tdp0 -V 2:1 || return 1 + + ovs_add_netns_and_veths "${sbxname}" tdp0 tns left0 l0 \ + 172.31.110.1/24 || return 1 + + info "removing veth interface from openvswitch and setting IP" + ovs_del_if "${sbxname}" tdp0 left0 || return 1 + ovs_sbx "${sbxname}" ip addr add 172.31.110.2/24 dev left0 || return 1 + ovs_sbx "${sbxname}" ip link set left0 up || return 1 + + info "setting up tunnel port in openvswitch" + ovs_add_if "${sbxname}" "vxlan" tdp0 ovs-vxlan0 -u || return 1 + on_exit "ovs_sbx ${sbxname} ip link del ovs-vxlan0" + ovs_wait ip link show ovs-vxlan0 &>/dev/null || return 1 + ovs_sbx "${sbxname}" ip link set ovs-vxlan0 up || return 1 + + configs=$(echo ' + 1 172.31.221.1/24 1155332 32 set udpcsum flags\(df\|csum\) + 2 172.31.222.1/24 1234567 45 set noudpcsum flags\(df\) + 3 172.31.223.1/24 1020304 23 unset udpcsum flags\(csum\) + 4 172.31.224.1/24 1357986 15 unset noudpcsum' | sed '/^$/d') + + while read -r i addr id ttl df csum flags; do + ovs_add_kernel_tunnel "${sbxname}" tns vxlan vxlan${i} ${addr} \ + remote 172.31.110.2 id ${id} dstport 4789 \ + ttl ${ttl} df ${df} ${csum} || return 1 + done <<< "${configs}" + + ovs_wait grep -q 'listening on upcall packet handler' \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + + info "sending arping" + for i in 1 2 3 4; do + ovs_sbx "${sbxname}" ip netns exec tns \ + arping -I vxlan${i} 172.31.22${i}.2 -c 1 \ + >${ovs_dir}/arping.stdout 2>${ovs_dir}/arping.stderr + done + + info "checking that received decapsulated packets carry correct metadata" + while read -r i addr id ttl df csum flags; do + arp_hdr="arp\\(sip=172.31.22${i}.1,tip=172.31.22${i}.2,op=1,sha=" + addrs="src=172.31.110.1,dst=172.31.110.2" + ports="tp_src=[0-9]*,tp_dst=4789" + tnl_md="tunnel\\(tun_id=${id},${addrs},ttl=${ttl},${ports},${flags}\\)" + + ovs_sbx "${sbxname}" grep -qE "MISS upcall.*${tnl_md}.*${arp_hdr}" \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + done <<< "${configs}" + + return 0 +} + run_test() { ( tname="$1" From d162694037215fe25f1487999c58d70df809a2fd Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 4 Sep 2025 13:06:41 +0200 Subject: [PATCH 316/450] smb: server: let smb_direct_writev() respect SMB_DIRECT_MAX_SEND_SGES We should not use more sges for ib_post_send() than we told the rdma device in rdma_create_qp()! Otherwise ib_post_send() will return -EINVAL, so we disconnect the connection. Or with the current siw.ko we'll get 0 from ib_post_send(), but will never ever get a completion for the request. I've already sent a fix for siw.ko... So we need to make sure smb_direct_writev() limits the number of vectors we pass to individual smb_direct_post_send_data() calls, so that we don't go over the queue pair limits. Commit 621433b7e25d ("ksmbd: smbd: relax the count of sges required") was very strange and I guess only needed because SMB_DIRECT_MAX_SEND_SGES was 8 at that time. It basically removed the check that the rdma device is able to handle the number of sges we try to use. While the real problem was added by commit ddbdc861e37c ("ksmbd: smbd: introduce read/write credits for RDMA read/write") as it used the minumun of device->attrs.max_send_sge and device->attrs.max_sge_rd, with the problem that device->attrs.max_sge_rd is always 1 for iWarp. And that limitation should only apply to RDMA Read operations. For now we keep that limitation for RDMA Write operations too, fixing that is a task for another day as it's not really required a bug fix. Commit 2b4eeeaa9061 ("ksmbd: decrease the number of SMB3 smbdirect server SGEs") lowered SMB_DIRECT_MAX_SEND_SGES to 6, which is also used by our client code. And that client code enforces device->attrs.max_send_sge >= 6 since commit d2e81f92e5b7 ("Decrease the number of SMB3 smbdirect client SGEs") and (briefly looking) only the i40w driver provides only 3, see I40IW_MAX_WQ_FRAGMENT_COUNT. But currently we'd require 4 anyway, so that would not work anyway, but now it fails early. Cc: Steve French Cc: Tom Talpey Cc: Hyunchul Lee Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: linux-rdma@vger.kernel.org Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Fixes: ddbdc861e37c ("ksmbd: smbd: introduce read/write credits for RDMA read/write") Fixes: 621433b7e25d ("ksmbd: smbd: relax the count of sges required") Fixes: 2b4eeeaa9061 ("ksmbd: decrease the number of SMB3 smbdirect server SGEs") Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 159 ++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 51 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 5466aa8c39b1..cc4322bfa1d6 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1209,78 +1209,130 @@ static int smb_direct_writev(struct ksmbd_transport *t, bool need_invalidate, unsigned int remote_key) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); - int remaining_data_length; - int start, i, j; - int max_iov_size = st->max_send_size - + size_t remaining_data_length; + size_t iov_idx; + size_t iov_ofs; + size_t max_iov_size = st->max_send_size - sizeof(struct smb_direct_data_transfer); int ret; - struct kvec vec; struct smb_direct_send_ctx send_ctx; + int error = 0; if (st->status != SMB_DIRECT_CS_CONNECTED) return -ENOTCONN; //FIXME: skip RFC1002 header.. + if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4)) + return -EINVAL; buflen -= 4; + iov_idx = 1; + iov_ofs = 0; remaining_data_length = buflen; ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key); - start = i = 1; - buflen = 0; - while (true) { - buflen += iov[i].iov_len; - if (buflen > max_iov_size) { - if (i > start) { - remaining_data_length -= - (buflen - iov[i].iov_len); - ret = smb_direct_post_send_data(st, &send_ctx, - &iov[start], i - start, - remaining_data_length); - if (ret) - goto done; - } else { - /* iov[start] is too big, break it */ - int nvec = (buflen + max_iov_size - 1) / - max_iov_size; + while (remaining_data_length) { + struct kvec vecs[SMB_DIRECT_MAX_SEND_SGES - 1]; /* minus smbdirect hdr */ + size_t possible_bytes = max_iov_size; + size_t possible_vecs; + size_t bytes = 0; + size_t nvecs = 0; - for (j = 0; j < nvec; j++) { - vec.iov_base = - (char *)iov[start].iov_base + - j * max_iov_size; - vec.iov_len = - min_t(int, max_iov_size, - buflen - max_iov_size * j); - remaining_data_length -= vec.iov_len; - ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1, - remaining_data_length); - if (ret) - goto done; - } - i++; - if (i == niovs) - break; - } - start = i; - buflen = 0; - } else { - i++; - if (i == niovs) { - /* send out all remaining vecs */ - remaining_data_length -= buflen; - ret = smb_direct_post_send_data(st, &send_ctx, - &iov[start], i - start, - remaining_data_length); - if (ret) + /* + * For the last message remaining_data_length should be + * have been 0 already! + */ + if (WARN_ON_ONCE(iov_idx >= niovs)) { + error = -EINVAL; + goto done; + } + + /* + * We have 2 factors which limit the arguments we pass + * to smb_direct_post_send_data(): + * + * 1. The number of supported sges for the send, + * while one is reserved for the smbdirect header. + * And we currently need one SGE per page. + * 2. The number of negotiated payload bytes per send. + */ + possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx); + + while (iov_idx < niovs && possible_vecs && possible_bytes) { + struct kvec *v = &vecs[nvecs]; + int page_count; + + v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs; + v->iov_len = min_t(size_t, + iov[iov_idx].iov_len - iov_ofs, + possible_bytes); + page_count = get_buf_page_count(v->iov_base, v->iov_len); + if (page_count > possible_vecs) { + /* + * If the number of pages in the buffer + * is to much (because we currently require + * one SGE per page), we need to limit the + * length. + * + * We know possible_vecs is at least 1, + * so we always keep the first page. + * + * We need to calculate the number extra + * pages (epages) we can also keep. + * + * We calculate the number of bytes in the + * first page (fplen), this should never be + * larger than v->iov_len because page_count is + * at least 2, but adding a limitation feels + * better. + * + * Then we calculate the number of bytes (elen) + * we can keep for the extra pages. + */ + size_t epages = possible_vecs - 1; + size_t fpofs = offset_in_page(v->iov_base); + size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len); + size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE); + + v->iov_len = fplen + elen; + page_count = get_buf_page_count(v->iov_base, v->iov_len); + if (WARN_ON_ONCE(page_count > possible_vecs)) { + /* + * Something went wrong in the above + * logic... + */ + error = -EINVAL; goto done; - break; + } } + possible_vecs -= page_count; + nvecs += 1; + possible_bytes -= v->iov_len; + bytes += v->iov_len; + + iov_ofs += v->iov_len; + if (iov_ofs >= iov[iov_idx].iov_len) { + iov_idx += 1; + iov_ofs = 0; + } + } + + remaining_data_length -= bytes; + + ret = smb_direct_post_send_data(st, &send_ctx, + vecs, nvecs, + remaining_data_length); + if (unlikely(ret)) { + error = ret; + goto done; } } done: ret = smb_direct_flush_send_list(st, &send_ctx, true); + if (unlikely(!ret && error)) + ret = error; /* * As an optimization, we don't wait for individual I/O to finish @@ -1744,6 +1796,11 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } + if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) { + pr_err("warning: device max_send_sge = %d too small\n", + device->attrs.max_send_sge); + return -EINVAL; + } if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { pr_err("warning: device max_recv_sge = %d too small\n", device->attrs.max_recv_sge); @@ -1767,7 +1824,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_wr = max_send_wrs; cap->max_recv_wr = t->recv_credit_max; - cap->max_send_sge = max_sge_per_wr; + cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; cap->max_rdma_ctxs = t->max_rw_credits; From 5282491fc49d5614ac6ddcd012e5743eecb6a67c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 10 Sep 2025 11:22:52 +0900 Subject: [PATCH 317/450] ksmbd: smbdirect: validate data_offset and data_length field of smb_direct_data_transfer If data_offset and data_length of smb_direct_data_transfer struct are invalid, out of bounds issue could happen. This patch validate data_offset and data_length field in recv_done. Cc: stable@vger.kernel.org Fixes: 2ea086e35c3d ("ksmbd: add buffer validation for smb direct") Reviewed-by: Stefan Metzmacher Reported-by: Luigino Camastra, Aisle Research Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index cc4322bfa1d6..d52f37578276 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -554,7 +554,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) case SMB_DIRECT_MSG_DATA_TRANSFER: { struct smb_direct_data_transfer *data_transfer = (struct smb_direct_data_transfer *)recvmsg->packet; - unsigned int data_length; + unsigned int data_offset, data_length; int avail_recvmsg_count, receive_credits; if (wc->byte_len < @@ -565,14 +565,15 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } data_length = le32_to_cpu(data_transfer->data_length); - if (data_length) { - if (wc->byte_len < sizeof(struct smb_direct_data_transfer) + - (u64)data_length) { - put_recvmsg(t, recvmsg); - smb_direct_disconnect_rdma_connection(t); - return; - } + data_offset = le32_to_cpu(data_transfer->data_offset); + if (wc->byte_len < data_offset || + wc->byte_len < (u64)data_offset + data_length) { + put_recvmsg(t, recvmsg); + smb_direct_disconnect_rdma_connection(t); + return; + } + if (data_length) { if (t->full_packet_received) recvmsg->first_segment = true; From e1868ba37fd27c6a68e31565402b154beaa65df0 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 11 Sep 2025 10:05:23 +0900 Subject: [PATCH 318/450] ksmbd: smbdirect: verify remaining_data_length respects max_fragmented_recv_size This is inspired by the check for data_offset + data_length. Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: stable@vger.kernel.org Fixes: 2ea086e35c3d ("ksmbd: add buffer validation for smb direct") Acked-by: Namjae Jeon Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/server/transport_rdma.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index d52f37578276..6550bd9f002c 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -554,7 +554,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) case SMB_DIRECT_MSG_DATA_TRANSFER: { struct smb_direct_data_transfer *data_transfer = (struct smb_direct_data_transfer *)recvmsg->packet; - unsigned int data_offset, data_length; + u32 remaining_data_length, data_offset, data_length; int avail_recvmsg_count, receive_credits; if (wc->byte_len < @@ -564,6 +564,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) return; } + remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); data_length = le32_to_cpu(data_transfer->data_length); data_offset = le32_to_cpu(data_transfer->data_offset); if (wc->byte_len < data_offset || @@ -572,6 +573,14 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(t); return; } + if (remaining_data_length > t->max_fragmented_recv_size || + data_length > t->max_fragmented_recv_size || + (u64)remaining_data_length + (u64)data_length > + (u64)t->max_fragmented_recv_size) { + put_recvmsg(t, recvmsg); + smb_direct_disconnect_rdma_connection(t); + return; + } if (data_length) { if (t->full_packet_received) From b62fd63ade7cb573b114972ef8f9fa505be8d74a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 3 Sep 2025 16:53:21 +0100 Subject: [PATCH 319/450] btrfs: fix invalid extref key setup when replaying dentry The offset for an extref item's key is not the object ID of the parent dir, otherwise we would not need the extref item and would use plain ref items. Instead the offset is the result of a hash computation that uses the object ID of the parent dir and the name associated to the entry. So fix this by setting the key offset at replay_one_name() to be the result of calling btrfs_extref_hash(). Fixes: 725af92a6251 ("btrfs: Open-code name_in_log_ref in replay_one_name") Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7d5d90845ca9..7a63afedd01e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1964,7 +1964,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; - search_key.offset = key->objectid; + search_key.offset = btrfs_extref_hash(key->objectid, name.name, name.len); ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); if (ret < 0) { goto out; From 5b8d2964754102323ca24495ba94892426284e3a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 5 Sep 2025 15:54:43 +0200 Subject: [PATCH 320/450] btrfs: zoned: fix incorrect ASSERT in btrfs_zoned_reserve_data_reloc_bg() When moving a block-group to the dedicated data relocation space-info in btrfs_zoned_reserve_data_reloc_bg() it is asserted that the newly created block group for data relocation does not contain any zone_unusable bytes. But on disks with zone_capacity < zone_size, the difference between zone_size and zone_capacity is accounted as zone_unusable. Instead of asserting that the block-group does not contain any zone_unusable bytes, remove them from the block-groups total size. Reported-by: Yi Zhang Link: https://lore.kernel.org/linux-block/CAHj4cs8-cS2E+-xQ-d2Bj6vMJZ+CwT_cbdWBTju4BV35LsvEYw@mail.gmail.com/ Fixes: daa0fde322350 ("btrfs: zoned: fix data relocation block group reservation") Reviewed-by: Naohiro Aota Tested-by: Yi Zhang Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index ea662036f441..efc2a81f50e5 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2582,9 +2582,9 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); space_info->total_bytes -= bg->length; space_info->disk_total -= bg->length * factor; + space_info->disk_total -= bg->zone_unusable; /* There is no allocation ever happened. */ ASSERT(bg->used == 0); - ASSERT(bg->zone_unusable == 0); /* No super block in a block group on the zoned setup. */ ASSERT(bg->bytes_super == 0); spin_unlock(&space_info->lock); From 8679d2687c351824d08cf1f0e86f3b65f22a00fe Mon Sep 17 00:00:00 2001 From: austinchang Date: Thu, 11 Sep 2025 06:06:29 +0000 Subject: [PATCH 321/450] btrfs: initialize inode::file_extent_tree after i_mode has been set btrfs_init_file_extent_tree() uses S_ISREG() to determine if the file is a regular file. In the beginning of btrfs_read_locked_inode(), the i_mode hasn't been read from inode item, then file_extent_tree won't be used at all in volumes without NO_HOLES. Fix this by calling btrfs_init_file_extent_tree() after i_mode is initialized in btrfs_read_locked_inode(). Fixes: 3d7db6e8bd22e6 ("btrfs: don't allocate file extent tree for non regular files") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Filipe Manana Signed-off-by: austinchang Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 3 --- fs/btrfs/inode.c | 11 +++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0f8d8e275143..c0c1ddd46b67 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1843,7 +1843,6 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; struct inode *vfs_inode = &inode->vfs_inode; @@ -1864,8 +1863,6 @@ int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev) i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e7218e78bff4..18db1053cdf0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3885,10 +3885,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(inode); - if (ret) - goto out; - ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; @@ -3920,8 +3916,6 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(inode, 0, - round_up(i_size_read(vfs_inode), fs_info->sectorsize)); inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); @@ -3953,6 +3947,11 @@ static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path btrfs_set_inode_mapping_order(inode); cache_index: + ret = btrfs_init_file_extent_tree(inode); + if (ret) + goto out; + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any From 80eb65ccf6f72dc37b972583fe71cd8a50ff7e51 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 8 Sep 2025 12:51:11 +0100 Subject: [PATCH 322/450] btrfs: annotate block group access with data_race() when sorting for reclaim When sorting the block group list for reclaim we are using a block group's used bytes counter without taking the block group's spinlock, so we can race with a concurrent task updating it (at btrfs_update_block_group()), which makes tools like KCSAN unhappy and report a race. Since the sorting is not strictly needed from a functional perspective and such races should rarely cause any ordering changes (only load/store tearing could cause them), not to mention that after the sorting the ordering may no longer be accurate due to concurrent allocations and deallocations of extents in a block group, annotate the accesses to the used counter with data_race() to silence KCSAN and similar tools. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9bf282d2453c..499a9edf0ca3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1795,7 +1795,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, bg1 = list_entry(a, struct btrfs_block_group, bg_list); bg2 = list_entry(b, struct btrfs_block_group, bg_list); - return bg1->used > bg2->used; + /* + * Some other task may be updating the ->used field concurrently, but it + * is not serious if we get a stale value or load/store tearing issues, + * as sorting the list of block groups to reclaim is not critical and an + * occasional imperfect order is ok. So silence KCSAN and avoid the + * overhead of locking or any other synchronization. + */ + return data_race(bg1->used > bg2->used); } static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) From 03ee64b5e525c40e9bc723885c6b0b9c6188b55b Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Mon, 4 Aug 2025 12:45:19 -0700 Subject: [PATCH 323/450] rv: Support systems with time64-only syscalls Some systems (like 32-bit RISC-V) only have the 64-bit time_t versions of syscalls. So handle the 32-bit time_t version of those being undefined. Fixes: f74f8bb246cf ("rv: Add rtapp_sleep monitor") Closes: https://lore.kernel.org/oe-kbuild-all/202508160204.SsFyNfo6-lkp@intel.com Signed-off-by: Palmer Dabbelt Acked-by: Nam Cao Link: https://lore.kernel.org/r/20250804194518.97620-2-palmer@dabbelt.com Signed-off-by: Gabriele Monaco --- kernel/trace/rv/monitors/sleep/sleep.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c index eea447b06907..c1347da69e9d 100644 --- a/kernel/trace/rv/monitors/sleep/sleep.c +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -127,7 +127,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) mon = ltl_get_monitor(current); switch (id) { +#ifdef __NR_clock_nanosleep case __NR_clock_nanosleep: +#endif #ifdef __NR_clock_nanosleep_time64 case __NR_clock_nanosleep_time64: #endif @@ -138,7 +140,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); break; +#ifdef __NR_futex case __NR_futex: +#endif #ifdef __NR_futex_time64 case __NR_futex_time64: #endif From de090d1ccae1e191af4beb92964591c6e4f31f28 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 6 Aug 2025 14:09:11 +0200 Subject: [PATCH 324/450] rv: Fix wrong type cast in enabled_monitors_next() Argument 'p' of enabled_monitors_next() is not a pointer to struct rv_monitor, it is actually a pointer to the list_head inside struct rv_monitor. Therefore it is wrong to cast 'p' to struct rv_monitor *. This wrong type cast has been there since the beginning. But it still worked because the list_head was the first field in struct rv_monitor_def. This is no longer true since commit 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") moved the list_head, and this wrong type cast became a functional problem. Properly use container_of() instead. Fixes: 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") Signed-off-by: Nam Cao Reviewed-by: Gabriele Monaco Link: https://lore.kernel.org/r/20250806120911.989365-1-namcao@linutronix.de Signed-off-by: Gabriele Monaco --- kernel/trace/rv/rv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 1482e91c39f4..b341445b8fbd 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -495,7 +495,7 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor *mon = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); (*pos)++; From 3afaff7a0ce97457c8ab46862f2c06603a89962e Mon Sep 17 00:00:00 2001 From: Akhilesh Patil Date: Mon, 11 Aug 2025 17:42:53 +0530 Subject: [PATCH 325/450] include/linux/rv.h: remove redundant include file Remove redundant include to clean up the code. Move all unique include files inside CONFIG_RV as they are only needed when CONFIG_RV is enabled. Arrange include files alphabetically. Fixes: 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") [1] Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202507312017.oyD08TL5-lkp@intel.com/ Signed-off-by: Akhilesh Patil Reviewed-by: Gabriele Monaco Link: https://lore.kernel.org/r/aJneRbHGlNFg7lr9@bhairav-test.ee.iitb.ac.in Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/linux/rv.h b/include/linux/rv.h index 14410a42faef..9520aab34bcb 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -7,16 +7,14 @@ #ifndef _LINUX_RV_H #define _LINUX_RV_H -#include -#include - #define MAX_DA_NAME_LEN 32 #define MAX_DA_RETRY_RACING_EVENTS 3 #ifdef CONFIG_RV -#include -#include #include +#include +#include +#include /* * Deterministic automaton per-object variables. From 9b5096761c184b3923ae45c5e82da31005a765c7 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Wed, 3 Sep 2025 14:51:12 +0800 Subject: [PATCH 326/450] rv: Fix missing mutex unlock in rv_register_monitor() If create_monitor_dir() fails, the function returns directly without releasing rv_interface_lock. This leaves the mutex locked and causes subsequent monitor registration attempts to deadlock. Fix it by making the error path jump to out_unlock, ensuring that the mutex is always released before returning. Fixes: 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") Signed-off-by: Zhen Ni Reviewed-by: Gabriele Monaco Reviewed-by: Nam Cao Link: https://lore.kernel.org/r/20250903065112.1878330-1-zhen.ni@easystack.cn Signed-off-by: Gabriele Monaco --- kernel/trace/rv/rv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index b341445b8fbd..48338520376f 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -805,7 +805,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) retval = create_monitor_dir(monitor, parent); if (retval) - return retval; + goto out_unlock; /* keep children close to the parent for easier visualisation */ if (parent) From 19c839a98c731169f06d32e7c9e00c78a0086ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Szymanski?= Date: Fri, 12 Sep 2025 22:18:50 +0200 Subject: [PATCH 327/450] gpiolib: acpi: initialize acpi_gpio_info struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 7c010d463372 ("gpiolib: acpi: Make sure we fill struct acpi_gpio_info"), uninitialized acpi_gpio_info struct are passed to __acpi_find_gpio() and later in the call stack info->quirks is used in acpi_populate_gpio_lookup. This breaks the i2c_hid_cpi driver: [ 58.122916] i2c_hid_acpi i2c-UNIW0001:00: HID over i2c has not been provided an Int IRQ [ 58.123097] i2c_hid_acpi i2c-UNIW0001:00: probe with driver i2c_hid_acpi failed with error -22 Fix this by initializing the acpi_gpio_info pass to __acpi_find_gpio() Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220388 Fixes: 7c010d463372 ("gpiolib: acpi: Make sure we fill struct acpi_gpio_info") Signed-off-by: Sébastien Szymanski Tested-by: Hans de Goede Reviewed-by: Hans de Goede Acked-by: Mika Westerberg Tested-By: Calvin Owens Cc: stable@vger.kernel.org Signed-off-by: Andy Shevchenko --- drivers/gpio/gpiolib-acpi-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpiolib-acpi-core.c b/drivers/gpio/gpiolib-acpi-core.c index e81a29902bb2..284e762d92c4 100644 --- a/drivers/gpio/gpiolib-acpi-core.c +++ b/drivers/gpio/gpiolib-acpi-core.c @@ -942,7 +942,7 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, { struct acpi_device *adev = to_acpi_device_node(fwnode); bool can_fallback = acpi_can_fallback_to_crs(adev, con_id); - struct acpi_gpio_info info; + struct acpi_gpio_info info = {}; struct gpio_desc *desc; int ret; @@ -999,7 +999,7 @@ int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int ret; for (i = 0, idx = 0; idx <= index; i++) { - struct acpi_gpio_info info; + struct acpi_gpio_info info = {}; struct gpio_desc *desc; /* Ignore -EPROBE_DEFER, it only matters if idx matches */ From f205ed23f0687ea8b14c140e0af9643eed683691 Mon Sep 17 00:00:00 2001 From: Bou-Saan Che Date: Sun, 14 Sep 2025 22:15:37 +0300 Subject: [PATCH 328/450] ALSA: hda: cs35l41: Support Lenovo Thinkbook 13x Gen 5 This laptop does not contain _DSD so needs to be supported using the configuration table. Signed-off-by: Bou-Saan Che Signed-off-by: Takashi Iwai --- sound/hda/codecs/side-codecs/cs35l41_hda_property.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/hda/codecs/side-codecs/cs35l41_hda_property.c b/sound/hda/codecs/side-codecs/cs35l41_hda_property.c index d8249d997c2a..16d5ea77192f 100644 --- a/sound/hda/codecs/side-codecs/cs35l41_hda_property.c +++ b/sound/hda/codecs/side-codecs/cs35l41_hda_property.c @@ -135,6 +135,8 @@ static const struct cs35l41_config cs35l41_config_table[] = { { "17AA38C8", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, { "17AA38F9", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, { "17AA38FA", 2, EXTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, 0, 0 }, 0, 2, -1, 0, 0, 0 }, + { "17AA3929", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, + { "17AA392B", 4, INTERNAL, { CS35L41_RIGHT, CS35L41_LEFT, CS35L41_RIGHT, CS35L41_LEFT }, 0, 2, -1, 1000, 4500, 24 }, {} }; @@ -558,6 +560,8 @@ static const struct cs35l41_prop_model cs35l41_prop_model_table[] = { { "CSC3551", "17AA38C8", generic_dsd_config }, { "CSC3551", "17AA38F9", generic_dsd_config }, { "CSC3551", "17AA38FA", generic_dsd_config }, + { "CSC3551", "17AA3929", generic_dsd_config }, + { "CSC3551", "17AA392B", generic_dsd_config }, {} }; From c1d31894d892a88454eee6530c99f3c4fcbc9397 Mon Sep 17 00:00:00 2001 From: Bou-Saan Che Date: Sun, 14 Sep 2025 22:17:38 +0300 Subject: [PATCH 329/450] ALSA: hda/realtek: Support Lenovo Thinkbook 13x Gen 5 The laptop does not contain valid _DSD for these amps, so requires entries into the CS35L41 configuration table to function correctly. Signed-off-by: Bou-Saan Che Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 59cc01079fee..c86ba6ae4ef8 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7100,6 +7100,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3913, "Lenovo 145", ALC236_FIXUP_LENOVO_INV_DMIC), SND_PCI_QUIRK(0x17aa, 0x391f, "Yoga S990-16 pro Quad YC Quad", ALC287_FIXUP_TXNW2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3920, "Yoga S990-16 pro Quad VECO Quad", ALC287_FIXUP_TXNW2781_I2C), + SND_PCI_QUIRK(0x17aa, 0x3929, "Thinkbook 13x Gen 5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), + SND_PCI_QUIRK(0x17aa, 0x392b, "Thinkbook 13x Gen 5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo B50-70", ALC269_FIXUP_DMIC_THINKPAD_ACPI), SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_PCM_44K), From d99c203034989c3652cc8e7330ba5750fc74c04f Mon Sep 17 00:00:00 2001 From: Bou-Saan Che Date: Sun, 14 Sep 2025 22:17:50 +0300 Subject: [PATCH 330/450] ALSA: hda/realtek: Fix volume control on Lenovo Thinkbook 13x Gen 4 The issue was caused by incorrect configuration in the driver, which prevented proper volume control on certain systems. Signed-off-by: Bou-Saan Che Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index c86ba6ae4ef8..618ec00d3f98 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7078,8 +7078,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x38be, "Yoga S980-14.5 proX YC Dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38bf, "Yoga S980-14.5 proX LX Dual", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38c3, "Y980 DUAL", ALC287_FIXUP_TAS2781_I2C), - SND_PCI_QUIRK(0x17aa, 0x38c7, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4), - SND_PCI_QUIRK(0x17aa, 0x38c8, "Thinkbook 13x Gen 4", ALC287_FIXUP_CS35L41_I2C_4), + SND_PCI_QUIRK(0x17aa, 0x38c7, "Thinkbook 13x Gen 4", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), + SND_PCI_QUIRK(0x17aa, 0x38c8, "Thinkbook 13x Gen 4", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), SND_PCI_QUIRK(0x17aa, 0x38cb, "Y790 YG DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38cd, "Y790 VECO DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38d2, "Lenovo Yoga 9 14IMH9", ALC287_FIXUP_YOGA9_14IMH9_BASS_SPK_PIN), From a38108a23ab558b834d71d542d32c05ab0fb64d4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Sep 2025 10:30:52 +0300 Subject: [PATCH 331/450] wifi: iwlwifi: pcie: fix byte count table for some devices In my previous fix for this condition, I erroneously listed 9000 instead of 7000 family, when 7000/8000 were already using iwlmvm. Thus the condition ended up wrong, causing the issue I had fixed for older devices to suddenly appear on 7000/8000 family devices. Correct the condition accordingly. Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/r/20250909165811.10729-1-00107082@163.com/ Fixes: 586e3cb33ba6 ("wifi: iwlwifi: fix byte count table for old devices") Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250915102743.777aaafbcc6c.I84404edfdfbf400501f6fb06def5b86c501da198@changeid Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c index d912e709a92c..bb03dad4a300 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c @@ -2092,7 +2092,7 @@ static void iwl_txq_gen1_update_byte_cnt_tbl(struct iwl_trans *trans, break; } - if (trans->mac_cfg->device_family >= IWL_DEVICE_FAMILY_9000 && + if (trans->mac_cfg->device_family >= IWL_DEVICE_FAMILY_7000 && trans->mac_cfg->device_family < IWL_DEVICE_FAMILY_AX210) len = DIV_ROUND_UP(len, 4); From c7a321e4e90e1bd072697bc050b9426e04cffc6a Mon Sep 17 00:00:00 2001 From: Mohammad Rafi Shaik Date: Sun, 14 Sep 2025 18:45:49 +0530 Subject: [PATCH 332/450] ASoC: qcom: sc8280xp: Fix sound card driver name match data for QCS8275 The QCS8275 board is based on Qualcomm's QCS8300 SoC family, and all supported firmware files are located in the qcs8300 directory. The sound topology and ALSA UCM configuration files have also been migrated from the qcs8275 directory to the actual SoC qcs8300 directory in linux-firmware. With the current setup, the sound topology fails to load, resulting in sound card registration failure. This patch updates the driver match data to use the correct driver name qcs8300 for the qcs8275-sndcard, ensuring that the sound card driver correctly loads the sound topology and ALSA UCM configuration files from the qcs8300 directory. Fixes: 34d340d48e595 ("ASoC: qcom: sc8280xp: Add support for QCS8275") Cc: stable@vger.kernel.org Signed-off-by: Mohammad Rafi Shaik Reviewed-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250914131549.1198740-1-mohammad.rafi.shaik@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/qcom/sc8280xp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/qcom/sc8280xp.c b/sound/soc/qcom/sc8280xp.c index 73f9f82c4e25..db48168b7d3f 100644 --- a/sound/soc/qcom/sc8280xp.c +++ b/sound/soc/qcom/sc8280xp.c @@ -186,7 +186,7 @@ static int sc8280xp_platform_probe(struct platform_device *pdev) static const struct of_device_id snd_sc8280xp_dt_match[] = { {.compatible = "qcom,qcm6490-idp-sndcard", "qcm6490"}, {.compatible = "qcom,qcs6490-rb3gen2-sndcard", "qcs6490"}, - {.compatible = "qcom,qcs8275-sndcard", "qcs8275"}, + {.compatible = "qcom,qcs8275-sndcard", "qcs8300"}, {.compatible = "qcom,qcs9075-sndcard", "qcs9075"}, {.compatible = "qcom,qcs9100-sndcard", "qcs9100"}, {.compatible = "qcom,sc8280xp-sndcard", "sc8280xp"}, From 73caf2bcf3f0a3843c091b78b0711e356f8a2ef9 Mon Sep 17 00:00:00 2001 From: Mac Chiang Date: Mon, 15 Sep 2025 10:54:56 +0800 Subject: [PATCH 333/450] ASoC: Intel: sof_sdw: use PRODUCT_FAMILY for Fatcat series PRODUCT_NAME is machine-specific. Use PRODUCT_FAMILY to ensure the machine quirk is applied with consistent audio configurations across Fatcat series products. Signed-off-by: Mac Chiang Reviewed-by: Kai Vehmanen Signed-off-by: Bard Liao Link: https://patch.msgid.link/20250915025456.1154200-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index f997b2dc221b..28f03a5f29f7 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -761,7 +761,7 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = { .callback = sof_sdw_quirk_cb, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Google"), - DMI_MATCH(DMI_PRODUCT_NAME, "Fatcat"), + DMI_MATCH(DMI_PRODUCT_FAMILY, "Google_Fatcat"), }, .driver_data = (void *)(SOC_SDW_PCH_DMIC | SOF_BT_OFFLOAD_SSP(2) | From d7871f400cad1da376f1d7724209a1c49226c456 Mon Sep 17 00:00:00 2001 From: Venkata Prasad Potturu Date: Wed, 10 Sep 2025 22:43:59 +0530 Subject: [PATCH 334/450] ASoC: amd: acp: Fix incorrect retrival of acp_chip_info Use dev_get_drvdata(dev->parent) instead of dev_get_platdata(dev) to correctly obtain acp_chip_info members in the acp I2S driver. Previously, some members were not updated properly due to incorrect data access, which could potentially lead to null pointer dereferences. This issue was missed in the earlier commit ("ASoC: amd: acp: Fix NULL pointer deref in acp_i2s_set_tdm_slot"), which only addressed set_tdm_slot(). This change ensures that all relevant functions correctly retrieve acp_chip_info, preventing further null pointer dereference issues. Fixes: e3933683b25e ("ASoC: amd: acp: Remove redundant acp_dev_data structure") Signed-off-by: Venkata Prasad Potturu Reviewed-by: Cezary Rojewski Link: https://patch.msgid.link/20250910171419.3682468-1-venkataprasad.potturu@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-i2s.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sound/soc/amd/acp/acp-i2s.c b/sound/soc/amd/acp/acp-i2s.c index 617690362ad7..4ba0a66981ea 100644 --- a/sound/soc/amd/acp/acp-i2s.c +++ b/sound/soc/amd/acp/acp-i2s.c @@ -73,7 +73,7 @@ static int acp_i2s_set_fmt(struct snd_soc_dai *cpu_dai, unsigned int fmt) { struct device *dev = cpu_dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); int mode; mode = fmt & SND_SOC_DAIFMT_FORMAT_MASK; @@ -199,7 +199,7 @@ static int acp_i2s_hwparams(struct snd_pcm_substream *substream, struct snd_pcm_ u32 reg_val, fmt_reg, tdm_fmt; u32 lrclk_div_val, bclk_div_val; - chip = dev_get_platdata(dev); + chip = dev_get_drvdata(dev->parent); rsrc = chip->rsrc; /* These values are as per Hardware Spec */ @@ -386,7 +386,7 @@ static int acp_i2s_trigger(struct snd_pcm_substream *substream, int cmd, struct { struct acp_stream *stream = substream->runtime->private_data; struct device *dev = dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); struct acp_resource *rsrc = chip->rsrc; u32 val, period_bytes, reg_val, ier_val, water_val, buf_size, buf_reg; @@ -516,14 +516,13 @@ static int acp_i2s_trigger(struct snd_pcm_substream *substream, int cmd, struct static int acp_i2s_prepare(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { struct device *dev = dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); struct acp_resource *rsrc = chip->rsrc; struct acp_stream *stream = substream->runtime->private_data; u32 reg_dma_size = 0, reg_fifo_size = 0, reg_fifo_addr = 0; u32 phy_addr = 0, acp_fifo_addr = 0, ext_int_ctrl; unsigned int dir = substream->stream; - chip = dev_get_platdata(dev); switch (dai->driver->id) { case I2S_SP_INSTANCE: if (dir == SNDRV_PCM_STREAM_PLAYBACK) { @@ -632,7 +631,7 @@ static int acp_i2s_startup(struct snd_pcm_substream *substream, struct snd_soc_d { struct acp_stream *stream = substream->runtime->private_data; struct device *dev = dai->component->dev; - struct acp_chip_info *chip = dev_get_platdata(dev); + struct acp_chip_info *chip = dev_get_drvdata(dev->parent); struct acp_resource *rsrc = chip->rsrc; unsigned int dir = substream->stream; unsigned int irq_bit = 0; From cc648f4dde2ffbb74b9c1626e3eaaac89c6fbe16 Mon Sep 17 00:00:00 2001 From: Balamurugan C Date: Mon, 15 Sep 2025 10:56:54 +0800 Subject: [PATCH 335/450] ASoC: Intel: PTL: Add entry for HDMI-In capture support to non-I2S codec boards. Adding HDMI-In capture support for the PTL products which doesn't have onboard I2S codec. But need to support HDMI-In capture via I2S and audio playback through HDMI/DP monitor. Signed-off-by: Balamurugan C Signed-off-by: Bard Liao Link: https://patch.msgid.link/20250915025655.1154279-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_ssp_amp.c | 6 ++++++ sound/soc/intel/common/soc-acpi-intel-ptl-match.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/sound/soc/intel/boards/sof_ssp_amp.c b/sound/soc/intel/boards/sof_ssp_amp.c index 48ee5353bdf1..729c0cd7c19c 100644 --- a/sound/soc/intel/boards/sof_ssp_amp.c +++ b/sound/soc/intel/boards/sof_ssp_amp.c @@ -216,6 +216,12 @@ static const struct platform_device_id board_ids[] = { /* SSP 0 and SSP 2 are used for HDMI IN */ SOF_HDMI_PLAYBACK_PRESENT), }, + { + .name = "ptl_lt6911_hdmi_ssp", + .driver_data = (kernel_ulong_t)(SOF_SSP_MASK_HDMI_CAPTURE(0x5) | + /* SSP 0 and SSP 2 are used for HDMI IN */ + SOF_HDMI_PLAYBACK_PRESENT), + }, { } }; MODULE_DEVICE_TABLE(platform, board_ids); diff --git a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c index e292701dfcfe..3c8b10e21ceb 100644 --- a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c @@ -61,6 +61,12 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_ptl_machines[] = { SND_SOC_ACPI_TPLG_INTEL_SSP_MSB | SND_SOC_ACPI_TPLG_INTEL_DMIC_NUMBER, }, + /* place amp-only boards in the end of table */ + { + .id = "INTC10B0", + .drv_name = "ptl_lt6911_hdmi_ssp", + .sof_tplg_filename = "sof-ptl-hdmi-ssp02.tplg", + }, {}, }; EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_ptl_machines); From 013e484dbd687a9174acf8f4450217bdb86ad788 Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Tue, 19 Aug 2025 15:39:51 +0000 Subject: [PATCH 336/450] drm/xe/tile: Release kobject for the failure path Call kobject_put() for the failure path to release the kobject v2: remove extra newline. (Matt) Fixes: e3d0839aa501 ("drm/xe/tile: Abort driver load for sysfs creation failure") Cc: Himal Prasad Ghimiray Reviewed-by: Matthew Brost Signed-off-by: Shuicheng Lin Link: https://lore.kernel.org/r/20250819153950.2973344-2-shuicheng.lin@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit b98775bca99511cc22ab459a2de646cd2fa7241f) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_tile_sysfs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_tile_sysfs.c b/drivers/gpu/drm/xe/xe_tile_sysfs.c index b804234a6551..9e1236a9ec67 100644 --- a/drivers/gpu/drm/xe/xe_tile_sysfs.c +++ b/drivers/gpu/drm/xe/xe_tile_sysfs.c @@ -44,16 +44,18 @@ int xe_tile_sysfs_init(struct xe_tile *tile) kt->tile = tile; err = kobject_add(&kt->base, &dev->kobj, "tile%d", tile->id); - if (err) { - kobject_put(&kt->base); - return err; - } + if (err) + goto err_object; tile->sysfs = &kt->base; err = xe_vram_freq_sysfs_init(tile); if (err) - return err; + goto err_object; return devm_add_action_or_reset(xe->drm.dev, tile_sysfs_fini, tile); + +err_object: + kobject_put(&kt->base); + return err; } From fef8b64e48e836344574b85132a1c317f4260022 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 11 Sep 2025 00:24:39 +0200 Subject: [PATCH 337/450] drm/xe/pf: Drop rounddown_pow_of_two fair LMEM limitation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This effectively reverts commit 4c3fe5eae46b ("drm/xe/pf: Limit fair VF LMEM provisioning") since we don't need it any more after non-contig VRAM allocations were fixed. This allows larger LMEM auto-provisioning for VFs, so instead: [ ] GT0: PF: LMEM available(14096M) fair(1 x 8192M) [ ] GT0: PF: VF1 provisioned with 8589934592 (8.00 GiB) LMEM or [ ] GT0: PF: LMEM available(14096M) fair(2 x 4096M) [ ] GT0: PF: VF1..VF2 provisioned with 4294967296 (4.00 GiB) LMEM we may get: [ ] GT0: PF: LMEM available(14096M) fair(1 x 14096M) [ ] GT0: PF: VF1 provisioned with 14780727296 (13.8 GiB) LMEM and [ ] GT0: PF: LMEM available(14096M) fair(2 x 7048M) [ ] GT0: PF: VF1..VF2 provisioned with 7390363648 (6.88 GiB) LMEM Fixes: 1e32ffbc9dc8 ("drm/xe/sriov: support non-contig VRAM provisioning") Signed-off-by: Michal Wajdeczko Reviewed-by: Piotr Piórkowski Link: https://lore.kernel.org/r/20250910222439.32869-1-michal.wajdeczko@intel.com (cherry picked from commit 95c1cfa306087142989bff34ea0e05dcd95ddc58) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c index 494909f74eb2..d84831a03610 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c @@ -1632,7 +1632,6 @@ static u64 pf_estimate_fair_lmem(struct xe_gt *gt, unsigned int num_vfs) u64 fair; fair = div_u64(available, num_vfs); - fair = rounddown_pow_of_two(fair); /* XXX: ttm_vram_mgr & drm_buddy limitation */ fair = ALIGN_DOWN(fair, alignment); #ifdef MAX_FAIR_LMEM fair = min_t(u64, MAX_FAIR_LMEM, fair); From 7f830e126dc357fc086905ce9730140fd4528d66 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 15 Sep 2025 11:04:12 -0500 Subject: [PATCH 338/450] x86/sev: Guard sev_evict_cache() with CONFIG_AMD_MEM_ENCRYPT The sev_evict_cache() is guest-related code and should be guarded by CONFIG_AMD_MEM_ENCRYPT, not CONFIG_KVM_AMD_SEV. CONFIG_AMD_MEM_ENCRYPT=y is required for a guest to run properly as an SEV-SNP guest, but a guest kernel built with CONFIG_KVM_AMD_SEV=n would get the stub function of sev_evict_cache() instead of the version that performs the actual eviction. Move the function declarations under the appropriate #ifdef. Fixes: 7b306dfa326f ("x86/sev: Evict cache lines during SNP memory validation") Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Cc: stable@kernel.org # 6.16.x Link: https://lore.kernel.org/r/70e38f2c4a549063de54052c9f64929705313526.1757708959.git.thomas.lendacky@amd.com --- arch/x86/include/asm/sev.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 02236962fdb1..465b19fd1a2d 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -562,6 +562,24 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, extern struct ghcb *boot_ghcb; +static inline void sev_evict_cache(void *va, int npages) +{ + volatile u8 val __always_unused; + u8 *bytes = va; + int page_idx; + + /* + * For SEV guests, a read from the first/last cache-lines of a 4K page + * using the guest key is sufficient to cause a flush of all cache-lines + * associated with that 4K page without incurring all the overhead of a + * full CLFLUSH sequence. + */ + for (page_idx = 0; page_idx < npages; page_idx++) { + val = bytes[page_idx * PAGE_SIZE]; + val = bytes[page_idx * PAGE_SIZE + PAGE_SIZE - 1]; + } +} + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define snp_vmpl 0 @@ -605,6 +623,7 @@ static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } +static inline void sev_evict_cache(void *va, int npages) {} #endif /* CONFIG_AMD_MEM_ENCRYPT */ @@ -619,24 +638,6 @@ int rmp_make_shared(u64 pfn, enum pg_level level); void snp_leak_pages(u64 pfn, unsigned int npages); void kdump_sev_callback(void); void snp_fixup_e820_tables(void); - -static inline void sev_evict_cache(void *va, int npages) -{ - volatile u8 val __always_unused; - u8 *bytes = va; - int page_idx; - - /* - * For SEV guests, a read from the first/last cache-lines of a 4K page - * using the guest key is sufficient to cause a flush of all cache-lines - * associated with that 4K page without incurring all the overhead of a - * full CLFLUSH sequence. - */ - for (page_idx = 0; page_idx < npages; page_idx++) { - val = bytes[page_idx * PAGE_SIZE]; - val = bytes[page_idx * PAGE_SIZE + PAGE_SIZE - 1]; - } -} #else static inline bool snp_probe_rmptable_info(void) { return false; } static inline int snp_rmptable_init(void) { return -ENOSYS; } @@ -652,7 +653,6 @@ static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} static inline void kdump_sev_callback(void) { } static inline void snp_fixup_e820_tables(void) {} -static inline void sev_evict_cache(void *va, int npages) {} #endif #endif From cd4ea81be3eb94047ad023c631afd9bd6c295400 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 12 Sep 2025 02:06:09 +0200 Subject: [PATCH 339/450] io_uring/io-wq: fix `max_workers` breakage and `nr_workers` underflow Commit 88e6c42e40de ("io_uring/io-wq: add check free worker before create new worker") reused the variable `do_create` for something else, abusing it for the free worker check. This caused the value to effectively always be `true` at the time `nr_workers < max_workers` was checked, but it should really be `false`. This means the `max_workers` setting was ignored, and worse: if the limit had already been reached, incrementing `nr_workers` was skipped even though another worker would be created. When later lots of workers exit, the `nr_workers` field could easily underflow, making the problem worse because more and more workers would be created without incrementing `nr_workers`. The simple solution is to use a different variable for the free worker check instead of using one variable for two different things. Cc: stable@vger.kernel.org Fixes: 88e6c42e40de ("io_uring/io-wq: add check free worker before create new worker") Signed-off-by: Max Kellermann Reviewed-by: Fengnan Chang Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 17dfaa0395c4..1d03b2fc4b25 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -352,16 +352,16 @@ static void create_worker_cb(struct callback_head *cb) struct io_wq *wq; struct io_wq_acct *acct; - bool do_create = false; + bool activated_free_worker, do_create = false; worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; acct = worker->acct; rcu_read_lock(); - do_create = !io_acct_activate_free_worker(acct); + activated_free_worker = io_acct_activate_free_worker(acct); rcu_read_unlock(); - if (!do_create) + if (activated_free_worker) goto no_need_create; raw_spin_lock(&acct->workers_lock); From 20c9ccffccd61b37325a0519fb6d485caeecf7fa Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sun, 14 Sep 2025 11:18:08 -0700 Subject: [PATCH 340/450] perf maps: Ensure kmap is set up for all inserts __maps__fixup_overlap_and_insert may split or directly insert a map, when doing this the map may need to have a kmap set up for the sake of the kmaps. The missing kmap set up fails the check_invariants test in maps, later "Internal error" reports from map__kmap and ultimately causes segfaults. Similar fixes were added in commit e0e4e0b8b7fa ("perf maps: Add missing map__set_kmap_maps() when replacing a kernel map") and commit 25d9c0301d36 ("perf maps: Set the kmaps for newly created/added kernel maps") but they missed cases. To try to reduce the risk of this, update the kmap directly following any manual insert. This identified another problem in maps__copy_from. Fixes: e0e4e0b8b7fa ("perf maps: Add missing map__set_kmap_maps() when replacing a kernel map") Fixes: 25d9c0301d36 ("perf maps: Set the kmaps for newly created/added kernel maps") Signed-off-by: Ian Rogers Signed-off-by: Namhyung Kim --- tools/perf/util/maps.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c index 85b2a93a59ac..779f6230130a 100644 --- a/tools/perf/util/maps.c +++ b/tools/perf/util/maps.c @@ -477,6 +477,7 @@ static int __maps__insert(struct maps *maps, struct map *new) } /* Insert the value at the end. */ maps_by_address[nr_maps] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) maps_by_name[nr_maps] = map__get(new); @@ -502,8 +503,6 @@ static int __maps__insert(struct maps *maps, struct map *new) if (map__end(new) < map__start(new)) RC_CHK_ACCESS(maps)->ends_broken = true; - map__set_kmap_maps(new, maps); - return 0; } @@ -891,6 +890,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) if (before) { map__put(maps_by_address[i]); maps_by_address[i] = before; + map__set_kmap_maps(before, maps); if (maps_by_name) { map__put(maps_by_name[ni]); @@ -918,6 +918,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) */ map__put(maps_by_address[i]); maps_by_address[i] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) { map__put(maps_by_name[ni]); @@ -942,14 +943,13 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new) */ map__put(maps_by_address[i]); maps_by_address[i] = map__get(new); + map__set_kmap_maps(new, maps); if (maps_by_name) { map__put(maps_by_name[ni]); maps_by_name[ni] = map__get(new); } - map__set_kmap_maps(new, maps); - check_invariants(maps); return err; } @@ -1019,6 +1019,7 @@ int maps__copy_from(struct maps *dest, struct maps *parent) err = unwind__prepare_access(dest, new, NULL); if (!err) { dest_maps_by_address[i] = new; + map__set_kmap_maps(new, dest); if (dest_maps_by_name) dest_maps_by_name[i] = map__get(new); RC_CHK_ACCESS(dest)->nr_maps = i + 1; From dcfd151d3277cc3bc73c94114e360556d47b992d Mon Sep 17 00:00:00 2001 From: Mallesh Koujalagi Date: Fri, 12 Sep 2025 17:04:58 +0530 Subject: [PATCH 341/450] drm/xe/hwmon: Remove type casting Refactor: eliminate type casts by using proper u32 declarations. v2: - Address review comments. (Karthik) v3: - Use the proper u32 type and drop cast. (Lucas De Marchi) - Modify variable when actually using u64 value. - Change r value to reg_value with u32 type. v4: - Remove newline between trailer and Signed-off-by. (Lucas De Marchi) - Change reg_val to val for more user-friendly logging. - Use mul_u32_u32 function since both values are u32. v5: - mul_u32_u32 function with shift. (Lucas De Marchi) Fixes: 7596d839f6228 ("drm/xe/hwmon: Add support to manage power limits though mailbox") Signed-off-by: Mallesh Koujalagi Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250912113458.2815172-1-mallesh.koujalagi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 4e1d3b5e6423dc841acd8691d75626b3d3b2b6a8) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_hwmon.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index c17ed1ae8649..c5b63e10bb91 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -286,7 +286,7 @@ static struct xe_reg xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg */ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *value) { - u64 reg_val = 0, min, max; + u32 reg_val = 0; struct xe_device *xe = hwmon->xe; struct xe_reg rapl_limit, pkg_power_sku; struct xe_mmio *mmio = xe_root_tile_mmio(xe); @@ -294,7 +294,7 @@ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channe mutex_lock(&hwmon->hwmon_lock); if (hwmon->xe->info.has_mbx_power_limits) { - xe_hwmon_pcode_read_power_limit(hwmon, attr, channel, (u32 *)®_val); + xe_hwmon_pcode_read_power_limit(hwmon, attr, channel, ®_val); } else { rapl_limit = xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, channel); pkg_power_sku = xe_hwmon_get_reg(hwmon, REG_PKG_POWER_SKU, channel); @@ -304,19 +304,21 @@ static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, u32 attr, int channe /* Check if PL limits are disabled. */ if (!(reg_val & PWR_LIM_EN)) { *value = PL_DISABLE; - drm_info(&hwmon->xe->drm, "%s disabled for channel %d, val 0x%016llx\n", + drm_info(&hwmon->xe->drm, "%s disabled for channel %d, val 0x%08x\n", PWR_ATTR_TO_STR(attr), channel, reg_val); goto unlock; } reg_val = REG_FIELD_GET(PWR_LIM_VAL, reg_val); - *value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power); + *value = mul_u32_u32(reg_val, SF_POWER) >> hwmon->scl_shift_power; /* For platforms with mailbox power limit support clamping would be done by pcode. */ if (!hwmon->xe->info.has_mbx_power_limits) { - reg_val = xe_mmio_read64_2x32(mmio, pkg_power_sku); - min = REG_FIELD_GET(PKG_MIN_PWR, reg_val); - max = REG_FIELD_GET(PKG_MAX_PWR, reg_val); + u64 pkg_pwr, min, max; + + pkg_pwr = xe_mmio_read64_2x32(mmio, pkg_power_sku); + min = REG_FIELD_GET(PKG_MIN_PWR, pkg_pwr); + max = REG_FIELD_GET(PKG_MAX_PWR, pkg_pwr); min = mul_u64_u32_shr(min, SF_POWER, hwmon->scl_shift_power); max = mul_u64_u32_shr(max, SF_POWER, hwmon->scl_shift_power); if (min && max) @@ -493,8 +495,8 @@ xe_hwmon_power_max_interval_show(struct device *dev, struct device_attribute *at { struct xe_hwmon *hwmon = dev_get_drvdata(dev); struct xe_mmio *mmio = xe_root_tile_mmio(hwmon->xe); - u32 x, y, x_w = 2; /* 2 bits */ - u64 r, tau4, out; + u32 reg_val, x, y, x_w = 2; /* 2 bits */ + u64 tau4, out; int channel = (to_sensor_dev_attr(attr)->index % 2) ? CHANNEL_PKG : CHANNEL_CARD; u32 power_attr = (to_sensor_dev_attr(attr)->index > 1) ? PL2_HWMON_ATTR : PL1_HWMON_ATTR; @@ -505,23 +507,24 @@ xe_hwmon_power_max_interval_show(struct device *dev, struct device_attribute *at mutex_lock(&hwmon->hwmon_lock); if (hwmon->xe->info.has_mbx_power_limits) { - ret = xe_hwmon_pcode_read_power_limit(hwmon, power_attr, channel, (u32 *)&r); + ret = xe_hwmon_pcode_read_power_limit(hwmon, power_attr, channel, ®_val); if (ret) { drm_err(&hwmon->xe->drm, - "power interval read fail, ch %d, attr %d, r 0%llx, ret %d\n", - channel, power_attr, r, ret); - r = 0; + "power interval read fail, ch %d, attr %d, val 0x%08x, ret %d\n", + channel, power_attr, reg_val, ret); + reg_val = 0; } } else { - r = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, channel)); + reg_val = xe_mmio_read32(mmio, xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT, + channel)); } mutex_unlock(&hwmon->hwmon_lock); xe_pm_runtime_put(hwmon->xe); - x = REG_FIELD_GET(PWR_LIM_TIME_X, r); - y = REG_FIELD_GET(PWR_LIM_TIME_Y, r); + x = REG_FIELD_GET(PWR_LIM_TIME_X, reg_val); + y = REG_FIELD_GET(PWR_LIM_TIME_Y, reg_val); /* * tau = (1 + (x / 4)) * power(2,y), x = bits(23:22), y = bits(21:17) From 7926ba2143d8fef40bb940232818c7363e33598c Mon Sep 17 00:00:00 2001 From: Nitin Gote Date: Thu, 11 Sep 2025 10:58:23 +0530 Subject: [PATCH 342/450] drm/xe: defer free of NVM auxiliary container to device release callback Do not kfree the intel_dg_nvm_dev in xe_nvm_fini() right after auxiliary_device_delete/uninit. The auxiliary_device embeds the device/kobject (and its name); freeing it too early can race with asynchronous device_del/udev processing and cause a use-after-free. Signed-off-by: Nitin Gote Fixes: c28bfb107dac ("drm/xe/nvm: add on-die non-volatile memory device") Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250911052823.226696-1-nitin.r.gote@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit d4c3ed963e41d488695cf91068eabb8eb9f538ec) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_nvm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_nvm.c b/drivers/gpu/drm/xe/xe_nvm.c index 61b0a1531a53..2cfe9eb67391 100644 --- a/drivers/gpu/drm/xe/xe_nvm.c +++ b/drivers/gpu/drm/xe/xe_nvm.c @@ -35,6 +35,10 @@ static const struct intel_dg_nvm_region regions[INTEL_DG_NVM_REGIONS] = { static void xe_nvm_release_dev(struct device *dev) { + struct auxiliary_device *aux = container_of(dev, struct auxiliary_device, dev); + struct intel_dg_nvm_dev *nvm = container_of(aux, struct intel_dg_nvm_dev, aux_dev); + + kfree(nvm); } static bool xe_nvm_non_posted_erase(struct xe_device *xe) @@ -162,6 +166,5 @@ void xe_nvm_fini(struct xe_device *xe) auxiliary_device_delete(&nvm->aux_dev); auxiliary_device_uninit(&nvm->aux_dev); - kfree(nvm); xe->nvm = NULL; } From a10f910c77f280327b481e77eab909934ec508f0 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Wed, 9 Jul 2025 10:54:38 +0200 Subject: [PATCH 343/450] drm: bridge: anx7625: Fix NULL pointer dereference with early IRQ If the interrupt occurs before resource initialization is complete, the interrupt handler/worker may access uninitialized data such as the I2C tcpc_client device, potentially leading to NULL pointer dereference. Signed-off-by: Loic Poulain Fixes: 8bdfc5dae4e3 ("drm/bridge: anx7625: Add anx7625 MIPI DSI/DPI to DP") Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250709085438.56188-1-loic.poulain@oss.qualcomm.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/analogix/anx7625.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/analogix/anx7625.c b/drivers/gpu/drm/bridge/analogix/anx7625.c index c0ad8f59e483..8b3304dedcd9 100644 --- a/drivers/gpu/drm/bridge/analogix/anx7625.c +++ b/drivers/gpu/drm/bridge/analogix/anx7625.c @@ -2677,7 +2677,7 @@ static int anx7625_i2c_probe(struct i2c_client *client) ret = devm_request_threaded_irq(dev, platform->pdata.intp_irq, NULL, anx7625_intr_hpd_isr, IRQF_TRIGGER_FALLING | - IRQF_ONESHOT, + IRQF_ONESHOT | IRQF_NO_AUTOEN, "anx7625-intp", platform); if (ret) { DRM_DEV_ERROR(dev, "fail to request irq\n"); @@ -2746,8 +2746,10 @@ static int anx7625_i2c_probe(struct i2c_client *client) } /* Add work function */ - if (platform->pdata.intp_irq) + if (platform->pdata.intp_irq) { + enable_irq(platform->pdata.intp_irq); queue_work(platform->workqueue, &platform->work); + } if (platform->pdata.audio_en) anx7625_register_audio(dev, platform); From c1b6b8c7706354b73196649c46b5e6d4d61c2f5c Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Wed, 10 Sep 2025 12:27:05 +0530 Subject: [PATCH 344/450] drm/amdgpu/gfx11: Add Cleaner Shader Support for GFX11.0.1/11.0.4 GPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable the cleaner shader for additional GFX11.0.1/11.0.4 series GPUs to ensure data isolation among GPU tasks. The cleaner shader is tasked with clearing the Local Data Store (LDS), Vector General Purpose Registers (VGPRs), and Scalar General Purpose Registers (SGPRs), which helps avoid data leakage and guarantees the accuracy of computational results. This update extends cleaner shader support to GFX11.0.1/11.0.4 GPUs, previously available for GFX11.0.3. It enhances security by clearing GPU memory between processes and maintains a consistent GPU state across KGD and KFD workloads. Cc: Wasee Alam Cc: Mario Sopena-Novales Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 0a71ceb27f88a944c2de2808b67b2f46ac75076b) --- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index c85de8c8f6f5..c37527704d43 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1654,6 +1654,21 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) } } break; + case IP_VERSION(11, 0, 1): + case IP_VERSION(11, 0, 4): + adev->gfx.cleaner_shader_ptr = gfx_11_0_3_cleaner_shader_hex; + adev->gfx.cleaner_shader_size = sizeof(gfx_11_0_3_cleaner_shader_hex); + if (adev->gfx.pfp_fw_version >= 102 && + adev->gfx.mec_fw_version >= 66 && + adev->mes.fw_version[0] >= 128) { + adev->gfx.enable_cleaner_shader = true; + r = amdgpu_gfx_cleaner_shader_sw_init(adev, adev->gfx.cleaner_shader_size); + if (r) { + adev->gfx.enable_cleaner_shader = false; + dev_err(adev->dev, "Failed to initialize cleaner shader\n"); + } + } + break; case IP_VERSION(11, 5, 0): case IP_VERSION(11, 5, 1): adev->gfx.cleaner_shader_ptr = gfx_11_0_3_cleaner_shader_hex; From 29a2f430475357f760679b249f33e7282688e292 Mon Sep 17 00:00:00 2001 From: Ivan Lipski Date: Tue, 2 Sep 2025 16:20:09 -0400 Subject: [PATCH 345/450] drm/amd/display: Allow RX6xxx & RX7700 to invoke amdgpu_irq_get/put [Why&How] As reported on https://gitlab.freedesktop.org/drm/amd/-/issues/3936, SMU hang can occur if the interrupts are not enabled appropriately, causing a vblank timeout. This patch reverts commit 5009628d8509 ("drm/amd/display: Remove unnecessary amdgpu_irq_get/put"), but only for RX6xxx & RX7700 GPUs, on which the issue was observed. This will re-enable interrupts regardless of whether the user space needed it or not. Fixes: 5009628d8509 ("drm/amd/display: Remove unnecessary amdgpu_irq_get/put") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3936 Suggested-by: Sun peng Li Reviewed-by: Sun peng Li Signed-off-by: Ivan Lipski Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 95d168b367aa28a59f94fc690ff76ebf69312c6d) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 4e86370ae705..97d9eba17963 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8717,7 +8717,16 @@ static int amdgpu_dm_encoder_init(struct drm_device *dev, static void manage_dm_interrupts(struct amdgpu_device *adev, struct amdgpu_crtc *acrtc, struct dm_crtc_state *acrtc_state) -{ +{ /* + * We cannot be sure that the frontend index maps to the same + * backend index - some even map to more than one. + * So we have to go through the CRTC to find the right IRQ. + */ + int irq_type = amdgpu_display_crtc_idx_to_irq_type( + adev, + acrtc->crtc_id); + struct drm_device *dev = adev_to_drm(adev); + struct drm_vblank_crtc_config config = {0}; struct dc_crtc_timing *timing; int offdelay; @@ -8770,7 +8779,35 @@ static void manage_dm_interrupts(struct amdgpu_device *adev, drm_crtc_vblank_on_config(&acrtc->base, &config); + /* Allow RX6xxx, RX7700, RX7800 GPUs to call amdgpu_irq_get.*/ + switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) { + case IP_VERSION(3, 0, 0): + case IP_VERSION(3, 0, 2): + case IP_VERSION(3, 0, 3): + case IP_VERSION(3, 2, 0): + if (amdgpu_irq_get(adev, &adev->pageflip_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot get pageflip irq!\n"); +#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) + if (amdgpu_irq_get(adev, &adev->vline0_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot get vline0 irq!\n"); +#endif + } + } else { + /* Allow RX6xxx, RX7700, RX7800 GPUs to call amdgpu_irq_put.*/ + switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) { + case IP_VERSION(3, 0, 0): + case IP_VERSION(3, 0, 2): + case IP_VERSION(3, 0, 3): + case IP_VERSION(3, 2, 0): +#if defined(CONFIG_DRM_AMD_SECURE_DISPLAY) + if (amdgpu_irq_put(adev, &adev->vline0_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot put vline0 irq!\n"); +#endif + if (amdgpu_irq_put(adev, &adev->pageflip_irq, irq_type)) + drm_err(dev, "DM_IRQ: Cannot put pageflip irq!\n"); + } + drm_crtc_vblank_off(&acrtc->base); } } From 4351ca3fcb3ffecf12631b4996bf085a2dad0db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Thu, 11 Sep 2025 15:33:34 +0200 Subject: [PATCH 346/450] rds: ib: Increment i_fastreg_wrs before bailing out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to increment i_fastreg_wrs before we bail out from rds_ib_post_reg_frmr(). We have a fixed budget of how many FRWR operations that can be outstanding using the dedicated QP used for memory registrations and de-registrations. This budget is enforced by the atomic_t i_fastreg_wrs. If we bail out early in rds_ib_post_reg_frmr(), we will "leak" the possibility of posting an FRWR operation, and if that accumulates, no FRWR operation can be carried out. Fixes: 1659185fb4d0 ("RDS: IB: Support Fastreg MR (FRMR) memory registration mode") Fixes: 3a2886cca703 ("net/rds: Keep track of and wait for FRWR segments in use upon shutdown") Cc: stable@vger.kernel.org Signed-off-by: Håkon Bugge Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20250911133336.451212-1-haakon.bugge@oracle.com Signed-off-by: Jakub Kicinski --- net/rds/ib_frmr.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 28c1b0022178..bd861191157b 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -133,12 +133,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len, &off, PAGE_SIZE); - if (unlikely(ret != ibmr->sg_dma_len)) - return ret < 0 ? ret : -EINVAL; + if (unlikely(ret != ibmr->sg_dma_len)) { + ret = ret < 0 ? ret : -EINVAL; + goto out_inc; + } - if (cmpxchg(&frmr->fr_state, - FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) - return -EBUSY; + if (cmpxchg(&frmr->fr_state, FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) { + ret = -EBUSY; + goto out_inc; + } atomic_inc(&ibmr->ic->i_fastreg_inuse_count); @@ -166,11 +169,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) /* Failure here can be because of -ENOMEM as well */ rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE); - atomic_inc(&ibmr->ic->i_fastreg_wrs); if (printk_ratelimit()) pr_warn("RDS/IB: %s returned error(%d)\n", __func__, ret); - goto out; + goto out_inc; } /* Wait for the registration to complete in order to prevent an invalid @@ -179,8 +181,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) */ wait_event(frmr->fr_reg_done, !frmr->fr_reg); -out: + return ret; +out_inc: + atomic_inc(&ibmr->ic->i_fastreg_wrs); return ret; } From 35ae4e86292ef7dfe4edbb9942955c884e984352 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 10 Sep 2025 02:43:34 +0000 Subject: [PATCH 347/450] bonding: set random address only when slaves already exist After commit 5c3bf6cba791 ("bonding: assign random address if device address is same as bond"), bonding will erroneously randomize the MAC address of the first interface added to the bond if fail_over_mac = follow. Correct this by additionally testing for the bond being empty before randomizing the MAC. Fixes: 5c3bf6cba791 ("bonding: assign random address if device address is same as bond") Reported-by: Qiuling Ren Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250910024336.400253-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 257333c88710..8832bc9f107b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2132,6 +2132,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len); } else if (bond->params.fail_over_mac == BOND_FOM_FOLLOW && BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && + bond_has_slaves(bond) && memcmp(slave_dev->dev_addr, bond_dev->dev_addr, bond_dev->addr_len) == 0) { /* Set slave to random address to avoid duplicate mac * address in later fail over. From 71379e1c95af2c57567fcac24184c94cb7de4cd6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 10 Sep 2025 02:43:35 +0000 Subject: [PATCH 348/450] selftests: bonding: add fail_over_mac testing Add a test to check each value of bond fail_over_mac option. Also fix a minor garp_test print issue. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250910024336.400253-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- .../drivers/net/bonding/bond_options.sh | 139 +++++++++++++++++- .../drivers/net/bonding/bond_topo_2d1c.sh | 3 + .../drivers/net/bonding/bond_topo_3d1c.sh | 2 + 3 files changed, 142 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index 7bc148889ca7..e3f3cc803b56 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -7,6 +7,7 @@ ALL_TESTS=" prio arp_validate num_grat_arp + fail_over_mac " lib_dir=$(dirname "$0") @@ -352,8 +353,8 @@ garp_test() exp_num=$(echo "${param}" | cut -f6 -d ' ') active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") - slowwait_for_counter $((exp_num + 5)) $exp_num \ - tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" + slowwait_for_counter $((exp_num + 5)) $exp_num tc_rule_handle_stats_get \ + "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" &> /dev/null # check result real_num=$(tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}") @@ -376,6 +377,140 @@ num_grat_arp() done } +check_all_mac_same() +{ + RET=0 + # all slaves should have same mac address (with the first port's mac) + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + local eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + local eth1_mac=$(ip -n "$s_ns" -j link show eth1 | jq -r '.[]["address"]') + local eth2_mac=$(ip -n "$s_ns" -j link show eth2 | jq -r '.[]["address"]') + if [ "$bond_mac" != "${mac[0]}" ] || [ "$eth0_mac" != "$bond_mac" ] || \ + [ "$eth1_mac" != "$bond_mac" ] || [ "$eth2_mac" != "$bond_mac" ]; then + RET=1 + fi +} + +check_bond_mac_same_with_first() +{ + RET=0 + # bond mac address should be same with the first added slave + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + if [ "$bond_mac" != "${mac[0]}" ]; then + RET=1 + fi +} + +check_bond_mac_same_with_active() +{ + RET=0 + # bond mac address should be same with active slave + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + local active_slave_mac=$(ip -n "$s_ns" -j link show "$active_slave" | jq -r '.[]["address"]') + if [ "$bond_mac" != "$active_slave_mac" ]; then + RET=1 + fi +} + +check_backup_slave_mac_not_change() +{ + RET=0 + # backup slave's mac address is not changed + if ip -n "$s_ns" -d -j link show type bond_slave | jq -e '.[] + | select(.linkinfo.info_slave_data.state=="BACKUP") + | select(.address != .linkinfo.info_slave_data.perm_hwaddr)' &> /dev/null; then + RET=1 + fi +} + +check_backup_slave_mac_inherit() +{ + local backup_mac + RET=0 + + # backup slaves should use mac[1] or mac[2] + local backup_macs=$(ip -n "$s_ns" -d -j link show type bond_slave | \ + jq -r '.[] | select(.linkinfo.info_slave_data.state=="BACKUP") | .address') + for backup_mac in $backup_macs; do + if [ "$backup_mac" != "${mac[1]}" ] && [ "$backup_mac" != "${mac[2]}" ]; then + RET=1 + fi + done +} + +check_first_slave_random_mac() +{ + RET=0 + # remove the first added slave and added it back + ip -n "$s_ns" link set eth0 nomaster + ip -n "$s_ns" link set eth0 master bond0 + + # the first slave should use random mac address + eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + [ "$eth0_mac" = "${mac[0]}" ] && RET=1 + log_test "bond fail_over_mac follow" "random first slave mac" + + # remove the first slave, the permanent MAC address should be restored back + ip -n "$s_ns" link set eth0 nomaster + eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + [ "$eth0_mac" != "${mac[0]}" ] && RET=1 +} + +do_active_backup_failover() +{ + local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + ip -n ${s_ns} link set ${active_slave} down + slowwait 2 active_slave_changed $active_slave + ip -n ${s_ns} link set ${active_slave} up +} + +fail_over_mac() +{ + # Bring down the first interface on the switch to force the bond to + # select another active interface instead of the first one that joined. + ip -n "$g_ns" link set s0 down + + # fail_over_mac none + bond_reset "mode active-backup miimon 100 fail_over_mac 0" + check_all_mac_same + log_test "fail_over_mac 0" "all slaves have same mac" + do_active_backup_failover + check_all_mac_same + log_test "fail_over_mac 0" "failover: all slaves have same mac" + + # fail_over_mac active + bond_reset "mode active-backup miimon 100 fail_over_mac 1" + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "backup slave mac is not changed" + do_active_backup_failover + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "failover: backup slave mac is not changed" + + # fail_over_mac follow + bond_reset "mode active-backup miimon 100 fail_over_mac 2" + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "backup slave mac inherit" + do_active_backup_failover + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "failover: bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "failover: backup slave mac inherit" + check_first_slave_random_mac + log_test "fail_over_mac 2" "first slave mac random" + +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh index 195ef83cfbf1..167aa4a4a12a 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh @@ -39,6 +39,8 @@ g_ip4="192.0.2.254" s_ip6="2001:db8::1" c_ip6="2001:db8::10" g_ip6="2001:db8::254" +mac[0]="00:0a:0b:0c:0d:01" +mac[1]="00:0a:0b:0c:0d:02" gateway_create() { @@ -62,6 +64,7 @@ server_create() for i in $(seq 0 1); do ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh index 3a1333d9a85b..23a2932301cc 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh @@ -26,6 +26,7 @@ # +-------------------------------------+ source bond_topo_2d1c.sh +mac[2]="00:0a:0b:0c:0d:03" setup_prepare() { @@ -36,6 +37,7 @@ setup_prepare() # Add the extra device as we use 3 down links for bond0 local i=2 ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 ip -n ${s_ns} link set eth${i} master bond0 From f755be0b1ff429a2ecf709beeb1bcd7abc111c2b Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:50 +0200 Subject: [PATCH 349/450] mptcp: propagate shutdown to subflows when possible When the MPTCP DATA FIN have been ACKed, there is no more MPTCP related metadata to exchange, and all subflows can be safely shutdown. Before this patch, the subflows were actually terminated at 'close()' time. That's certainly fine most of the time, but not when the userspace 'shutdown()' a connection, without close()ing it. When doing so, the subflows were staying in LAST_ACK state on one side -- and consequently in FIN_WAIT2 on the other side -- until the 'close()' of the MPTCP socket. Now, when the DATA FIN have been ACKed, all subflows are shutdown. A consequence of this is that the TCP 'FIN' flag can be set earlier now, but the end result is the same. This affects the packetdrill tests looking at the end of the MPTCP connections, but for a good reason. Note that tcp_shutdown() will check the subflow state, so no need to do that again before calling it. Fixes: 3721b9b64676 ("mptcp: Track received DATA_FIN sequence number and add related helpers") Cc: stable@vger.kernel.org Fixes: 16a9a9da1723 ("mptcp: Add helper to process acks of DATA_FIN") Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-1-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e6fd97b21e9e..5e497a83e967 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -371,6 +371,20 @@ static void mptcp_close_wake_up(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } +static void mptcp_shutdown_subflows(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + slow = lock_sock_fast(ssk); + tcp_shutdown(ssk, SEND_SHUTDOWN); + unlock_sock_fast(ssk, slow); + } +} + /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { @@ -395,6 +409,7 @@ static void mptcp_check_data_fin_ack(struct sock *sk) break; case TCP_CLOSING: case TCP_LAST_ACK: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; } @@ -563,6 +578,7 @@ static bool mptcp_check_data_fin(struct sock *sk) mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; default: From 14e22b43df25dbd4301351b882486ea38892ae4f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:51 +0200 Subject: [PATCH 350/450] selftests: mptcp: connect: catch IO errors on listen side IO errors were correctly printed to stderr, and propagated up to the main loop for the server side, but the returned value was ignored. As a consequence, the program for the listener side was no longer exiting with an error code in case of IO issues. Because of that, some issues might not have been seen. But very likely, most issues either had an effect on the client side, or the file transfer was not the expected one, e.g. the connection got reset before the end. Still, it is better to fix this. The main consequence of this issue is the error that was reported by the selftests: the received and sent files were different, and the MIB counters were not printed. Also, when such errors happened during the 'disconnect' tests, the program tried to continue until the timeout. Now when an IO error is detected, the program exits directly with an error. Fixes: 05be5e273c84 ("selftests: mptcp: add disconnect tests") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-2-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 4f07ac9fa207..1408698df099 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1093,6 +1093,7 @@ int main_loop_s(int listensock) struct pollfd polls; socklen_t salen; int remotesock; + int err = 0; int fd = 0; again: @@ -1125,7 +1126,7 @@ int main_loop_s(int listensock) SOCK_TEST_TCPULP(remotesock, 0); memset(&winfo, 0, sizeof(winfo)); - copyfd_io(fd, remotesock, 1, true, &winfo); + err = copyfd_io(fd, remotesock, 1, true, &winfo); } else { perror("accept"); return 1; @@ -1134,10 +1135,10 @@ int main_loop_s(int listensock) if (cfg_input) close(fd); - if (--cfg_repeat > 0) + if (!err && --cfg_repeat > 0) goto again; - return 0; + return err; } static void init_rng(void) From 8708c5d8b3fb3f6d5d3b9e6bfe01a505819f519a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:52 +0200 Subject: [PATCH 351/450] selftests: mptcp: avoid spurious errors on TCP disconnect The disconnect test-case, with 'plain' TCP sockets generates spurious errors, e.g. 07 ns1 TCP -> ns1 (dead:beef:1::1:10006) MPTCP read: Connection reset by peer read: Connection reset by peer (duration 155ms) [FAIL] client exit code 3, server 3 netns ns1-FloSdv (listener) socket stat for 10006: TcpActiveOpens 2 0.0 TcpPassiveOpens 2 0.0 TcpEstabResets 2 0.0 TcpInSegs 274 0.0 TcpOutSegs 276 0.0 TcpOutRsts 3 0.0 TcpExtPruneCalled 2 0.0 TcpExtRcvPruned 1 0.0 TcpExtTCPPureAcks 104 0.0 TcpExtTCPRcvCollapsed 2 0.0 TcpExtTCPBacklogCoalesce 42 0.0 TcpExtTCPRcvCoalesce 43 0.0 TcpExtTCPChallengeACK 1 0.0 TcpExtTCPFromZeroWindowAdv 42 0.0 TcpExtTCPToZeroWindowAdv 41 0.0 TcpExtTCPWantZeroWindowAdv 13 0.0 TcpExtTCPOrigDataSent 164 0.0 TcpExtTCPDelivered 165 0.0 TcpExtTCPRcvQDrop 1 0.0 In the failing scenarios (TCP -> MPTCP), the involved sockets are actually plain TCP ones, as fallbacks for passive sockets at 2WHS time cause the MPTCP listeners to actually create 'plain' TCP sockets. Similar to commit 218cc166321f ("selftests: mptcp: avoid spurious errors on disconnect"), the root cause is in the user-space bits: the test program tries to disconnect as soon as all the pending data has been spooled, generating an RST. If such option reaches the peer before the connection has reached the closed status, the TCP socket will report an error to the user-space, as per protocol specification, causing the above failure. Note that it looks like this issue got more visible since the "tcp: receiver changes" series from commit 06baf9bfa6ca ("Merge branch 'tcp-receiver-changes'"). Address the issue by explicitly waiting for the TCP sockets (-t) to reach a closed status before performing the disconnect. More precisely, the test program now waits for plain TCP sockets or TCP subflows in addition to the MPTCP sockets that were already monitored. While at it, use 'ss' with '-n' to avoid resolving service names, which is not needed here. Fixes: 218cc166321f ("selftests: mptcp: avoid spurious errors on disconnect") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-3-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 1408698df099..b148cadb96d0 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1248,7 +1248,7 @@ void xdisconnect(int fd) else xerror("bad family"); - strcpy(cmd, "ss -M | grep -q "); + strcpy(cmd, "ss -Mnt | grep -q "); cmdlen = strlen(cmd); if (!inet_ntop(addr.ss_family, raw_addr, &cmd[cmdlen], sizeof(cmd) - cmdlen)) @@ -1258,7 +1258,7 @@ void xdisconnect(int fd) /* * wait until the pending data is completely flushed and all - * the MPTCP sockets reached the closed status. + * the sockets reached the closed status. * disconnect will bypass/ignore/drop any pending data. */ for (i = 0; ; i += msec_sleep) { From a17c5aa3a32373f80b4714b411bd8d4ffee6fc6a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:53 +0200 Subject: [PATCH 352/450] selftests: mptcp: print trailing bytes with od This is better than printing random bytes in the terminal. Note that Jakub suggested 'hexdump', but Mat found out this tool is not often installed by default. 'od' can do a similar job, and it is in the POSIX specs and available in coreutils, so it should be on more systems. While at it, display a few more bytes, just to fill in the two lines. And no need to display the 3rd only line showing the next number of bytes: 0000040. Suggested-by: Jakub Kicinski Suggested-by: Mat Martineau Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-4-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 09cd24b2ae46..d62e653d48b0 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -384,7 +384,7 @@ mptcp_lib_make_file() { mptcp_lib_print_file_err() { ls -l "${1}" 1>&2 echo "Trailing bytes are: " - tail -c 27 "${1}" + tail -c 32 "${1}" | od -x | head -n2 } # $1: input file ; $2: output file ; $3: what kind of file From cf74e0aa0eb024741c8b5cb7e839d2824ca77336 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:54 +0200 Subject: [PATCH 353/450] selftests: mptcp: connect: print pcap prefix To be able to find which capture files have been produced after several runs. This prefix was not printed anywhere before. While at it, always use the same prefix by taking info from ns1, instead of "$connector_ns", which is sometimes ns1, sometimes ns2 in the subtests. Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-5-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index c2ab9f7f0d21..47ecb5b3836e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -211,6 +211,11 @@ if $checksum; then done fi +if $capture; then + rndh="${ns1:4}" + mptcp_lib_pr_info "Packet capture files will have this prefix: ${rndh}-" +fi + set_ethtool_flags() { local ns="$1" local dev="$2" @@ -361,7 +366,6 @@ do_transfer() if $capture; then local capuser - local rndh="${connector_ns:4}" if [ -z $SUDO_USER ] ; then capuser="" else From 96939cec994070aa5df852c10fad5fc303a97ea3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:20 +0200 Subject: [PATCH 354/450] mptcp: set remote_deny_join_id0 on SYN recv When a SYN containing the 'C' flag (deny join id0) was received, this piece of information was not propagated to the path-manager. Even if this flag is mainly set on the server side, a client can also tell the server it cannot try to establish new subflows to the client's initial IP address and port. The server's PM should then record such info when received, and before sending events about the new connection. Fixes: df377be38725 ("mptcp: add deny_join_id0 in mptcp_options_received") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-1-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3f1b62a9fe88..f31a3a79531a 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -883,6 +883,10 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); + + if (mp_opt.deny_join_id0) + WRITE_ONCE(owner->pm.remote_deny_join_id0, true); + mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress From 2293c57484ae64c9a3c847c8807db8c26a3a4d41 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:21 +0200 Subject: [PATCH 355/450] mptcp: pm: nl: announce deny-join-id0 flag During the connection establishment, a peer can tell the other one that it cannot establish new subflows to the initial IP address and port by setting the 'C' flag [1]. Doing so makes sense when the sender is behind a strict NAT, operating behind a legacy Layer 4 load balancer, or using anycast IP address for example. When this 'C' flag is set, the path-managers must then not try to establish new subflows to the other peer's initial IP address and port. The in-kernel PM has access to this info, but the userspace PM didn't. The RFC8684 [1] is strict about that: (...) therefore the receiver MUST NOT try to open any additional subflows toward this address and port. So it is important to tell the userspace about that as it is responsible for the respect of this flag. When a new connection is created and established, the Netlink events now contain the existing but not currently used 'flags' attribute. When MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 is set, it means no other subflows to the initial IP address and port -- info that are also part of the event -- can be established. Link: https://datatracker.ietf.org/doc/html/rfc8684#section-3.1-20.6 [1] Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Reported-by: Marek Majkowski Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/532 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-2-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 4 ++-- include/uapi/linux/mptcp.h | 2 ++ include/uapi/linux/mptcp_pm.h | 4 ++-- net/mptcp/pm_netlink.c | 7 +++++++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index d15335684ec3..d1b4829b580a 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 67d015df8893..5fd5b4cf75ca 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -31,6 +31,8 @@ #define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) +#define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) + #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 6ac84b2f636c..7359d34da446 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 50aaf259959a..ce7d42d3bd00 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -408,6 +408,7 @@ static int mptcp_event_created(struct sk_buff *skb, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); + u16 flags = 0; if (err) return err; @@ -415,6 +416,12 @@ static int mptcp_event_created(struct sk_buff *skb, if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; + if (READ_ONCE(msk->pm.remote_deny_join_id0)) + flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; + + if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } From 24733e193a0d68f20d220e86da0362460c9aa812 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:22 +0200 Subject: [PATCH 356/450] selftests: mptcp: userspace pm: validate deny-join-id0 flag The previous commit adds the MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 flag. Make sure it is correctly announced by the other peer when it has been received. pm_nl_ctl will now display 'deny_join_id0:1' when monitoring the events, and when this flag was set by the other peer. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-3-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 7 +++++++ tools/testing/selftests/net/mptcp/userspace_pm.sh | 14 +++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 994a556f46c1..93fea3442216 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -188,6 +188,13 @@ static int capture_events(int fd, int event_group) fprintf(stderr, ",error:%u", *(__u8 *)RTA_DATA(attrs)); else if (attrs->rta_type == MPTCP_ATTR_SERVER_SIDE) fprintf(stderr, ",server_side:%u", *(__u8 *)RTA_DATA(attrs)); + else if (attrs->rta_type == MPTCP_ATTR_FLAGS) { + __u16 flags = *(__u16 *)RTA_DATA(attrs); + + /* only print when present, easier */ + if (flags & MPTCP_PM_EV_FLAG_DENY_JOIN_ID0) + fprintf(stderr, ",deny_join_id0:1"); + } attrs = RTA_NEXT(attrs, msg_len); } diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 970c329735ff..3d45991f24ed 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -201,6 +201,9 @@ make_connection() is_v6="v4" fi + # set this on the client side only: will not affect the rest + ip netns exec "$ns2" sysctl -q net.mptcp.allow_join_initial_addr_port=0 + :>"$client_evts" :>"$server_evts" @@ -223,23 +226,28 @@ make_connection() local client_token local client_port local client_serverside + local client_nojoin local server_token local server_serverside + local server_nojoin client_token=$(mptcp_lib_evts_get_info token "$client_evts") client_port=$(mptcp_lib_evts_get_info sport "$client_evts") client_serverside=$(mptcp_lib_evts_get_info server_side "$client_evts") + client_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$client_evts") server_token=$(mptcp_lib_evts_get_info token "$server_evts") server_serverside=$(mptcp_lib_evts_get_info server_side "$server_evts") + server_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$server_evts") print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1" - if [ "$client_token" != "" ] && [ "$server_token" != "" ] && [ "$client_serverside" = 0 ] && - [ "$server_serverside" = 1 ] + if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] && + [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] && + [ "${client_nojoin:-0}" = 0 ] && [ "${server_nojoin:-0}" = 1 ] then test_pass print_title "Connection info: ${client_addr}:${client_port} -> ${connect_addr}:${app_port}" else - test_fail "Expected tokens (c:${client_token} - s:${server_token}) and server (c:${client_serverside} - s:${server_serverside})" + test_fail "Expected tokens (c:${client_token} - s:${server_token}), server (c:${client_serverside} - s:${server_serverside}), nojoin (c:${client_nojoin} - s:${server_nojoin})" mptcp_lib_result_print_all_tap exit ${KSFT_FAIL} fi From 92da495cb65719583aa06bc946aeb18a10e1e6e2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:23 +0200 Subject: [PATCH 357/450] mptcp: tfo: record 'deny join id0' info When TFO is used, the check to see if the 'C' flag (deny join id0) was set was bypassed. This flag can be set when TFO is used, so the check should also be done when TFO is used. Note that the set_fully_established label is also used when a 4th ACK is received. In this case, deny_join_id0 will not be set. Fixes: dfc8d0603033 ("mptcp: implement delayed seq generation for passive fastopen") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-4-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 2a8ea28442b2..1103b3341a70 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -985,13 +985,13 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return false; } - if (mp_opt->deny_join_id0) - WRITE_ONCE(msk->pm.remote_deny_join_id0, true); - if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: + if (mp_opt->deny_join_id0) + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); + mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); From b86418beade11d45540a2d20c4ec1128849b6c27 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 12 Sep 2025 14:52:24 +0200 Subject: [PATCH 358/450] selftests: mptcp: sockopt: fix error messages This patch fixes several issues in the error reporting of the MPTCP sockopt selftest: 1. Fix diff not printed: The error messages for counter mismatches had the actual difference ('diff') as argument, but it was missing in the format string. Displaying it makes the debugging easier. 2. Fix variable usage: The error check for 'mptcpi_bytes_acked' incorrectly used 'ret2' (sent bytes) for both the expected value and the difference calculation. It now correctly uses 'ret' (received bytes), which is the expected value for bytes_acked. 3. Fix off-by-one in diff: The calculation for the 'mptcpi_rcv_delta' diff was 's.mptcpi_rcv_delta - ret', which is off-by-one. It has been corrected to 's.mptcpi_rcv_delta - (ret + 1)' to match the expected value in the condition above it. Fixes: 5dcff89e1455 ("selftests: mptcp: explicitly tests aggregate counters") Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-5-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_sockopt.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index e934dd26a59d..112c07c4c37a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -667,22 +667,26 @@ static void process_one_client(int fd, int pipefd) do_getsockopts(&s, fd, ret, ret2); if (s.mptcpi_rcv_delta != (uint64_t)ret + 1) - xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64, s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - ret); + xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64 ", diff %" PRId64, + s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - (ret + 1)); /* be nice when running on top of older kernel */ if (s.pkt_stats_avail) { if (s.last_sample.mptcpi_bytes_sent != ret2) - xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_sent, ret2, s.last_sample.mptcpi_bytes_sent - ret2); if (s.last_sample.mptcpi_bytes_received != ret) - xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_received, ret, s.last_sample.mptcpi_bytes_received - ret); if (s.last_sample.mptcpi_bytes_acked != ret) - xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64, - s.last_sample.mptcpi_bytes_acked, ret2, - s.last_sample.mptcpi_bytes_acked - ret2); + xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, + s.last_sample.mptcpi_bytes_acked, ret, + s.last_sample.mptcpi_bytes_acked - ret); } close(fd); From 93ab4881a4e2b9657bdce4b8940073bfb4ed5eab Mon Sep 17 00:00:00 2001 From: Yeounsu Moon Date: Sat, 13 Sep 2025 15:01:36 +0900 Subject: [PATCH 359/450] net: natsemi: fix `rx_dropped` double accounting on `netif_rx()` failure `netif_rx()` already increments `rx_dropped` core stat when it fails. The driver was also updating `ndev->stats.rx_dropped` in the same path. Since both are reported together via `ip -s -s` command, this resulted in drops being counted twice in user-visible stats. Keep the driver update on `if (unlikely(!skb))`, but skip it after `netif_rx()` errors. Fixes: caf586e5f23c ("net: add a core netdev->rx_dropped counter") Signed-off-by: Yeounsu Moon Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250913060135.35282-3-yyyynoom@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/natsemi/ns83820.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 56d5464222d9..cdbf82affa7b 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -820,7 +820,7 @@ static void rx_irq(struct net_device *ndev) struct ns83820 *dev = PRIV(ndev); struct rx_info *info = &dev->rx_info; unsigned next_rx; - int rx_rc, len; + int len; u32 cmdsts; __le32 *desc; unsigned long flags; @@ -881,8 +881,10 @@ static void rx_irq(struct net_device *ndev) if (likely(CMDSTS_OK & cmdsts)) { #endif skb_put(skb, len); - if (unlikely(!skb)) + if (unlikely(!skb)) { + ndev->stats.rx_dropped++; goto netdev_mangle_me_harder_failed; + } if (cmdsts & CMDSTS_DEST_MULTI) ndev->stats.multicast++; ndev->stats.rx_packets++; @@ -901,15 +903,12 @@ static void rx_irq(struct net_device *ndev) __vlan_hwaccel_put_tag(skb, htons(ETH_P_IPV6), tag); } #endif - rx_rc = netif_rx(skb); - if (NET_RX_DROP == rx_rc) { -netdev_mangle_me_harder_failed: - ndev->stats.rx_dropped++; - } + netif_rx(skb); } else { dev_kfree_skb_irq(skb); } +netdev_mangle_me_harder_failed: nr++; next_rx = info->next_rx; desc = info->descs + (DESC_SIZE * next_rx); From ce4be9e4307c5a60701ff6e0cafa74caffdc54ce Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 9 Sep 2025 13:48:35 +0900 Subject: [PATCH 360/450] zram: fix slot write race condition Parallel concurrent writes to the same zram index result in leaked zsmalloc handles. Schematically we can have something like this: CPU0 CPU1 zram_slot_lock() zs_free(handle) zram_slot_lock() zram_slot_lock() zs_free(handle) zram_slot_lock() compress compress handle = zs_malloc() handle = zs_malloc() zram_slot_lock zram_set_handle(handle) zram_slot_lock zram_slot_lock zram_set_handle(handle) zram_slot_lock Either CPU0 or CPU1 zsmalloc handle will leak because zs_free() is done too early. In fact, we need to reset zram entry right before we set its new handle, all under the same slot lock scope. Link: https://lkml.kernel.org/r/20250909045150.635345-1-senozhatsky@chromium.org Fixes: 71268035f5d7 ("zram: free slot memory early during write") Signed-off-by: Sergey Senozhatsky Reported-by: Changhui Zhong Closes: https://lore.kernel.org/all/CAGVVp+UtpGoW5WEdEU7uVTtsSCjPN=ksN6EcvyypAtFDOUf30A@mail.gmail.com/ Tested-by: Changhui Zhong Cc: Jens Axboe Cc: Minchan Kim Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8acad3cc6e6e..f31652085adc 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1795,6 +1795,7 @@ static int write_same_filled_page(struct zram *zram, unsigned long fill, u32 index) { zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_SAME); zram_set_handle(zram, index, fill); zram_slot_unlock(zram, index); @@ -1832,6 +1833,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page, kunmap_local(src); zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_HUGE); zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, PAGE_SIZE); @@ -1855,11 +1857,6 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) unsigned long element; bool same_filled; - /* First, free memory allocated to this slot (if any) */ - zram_slot_lock(zram, index); - zram_free_page(zram, index); - zram_slot_unlock(zram, index); - mem = kmap_local_page(page); same_filled = page_same_filled(mem, &element); kunmap_local(mem); @@ -1901,6 +1898,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) zcomp_stream_put(zstrm); zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index); From 35e526398bd0faddef3396d71760e4c5ea75868f Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Sat, 23 Aug 2025 20:16:47 +0800 Subject: [PATCH 361/450] drm/i915/backlight: Honor VESA eDP backlight luminance control capability The VESA AUX backlight fails to be enable luminance based backlight mainpulation becaused luminance_set is false by default. Fix it by using luminance support control capabitliy. Fixes: e13af5166a359 ("drm/i915/backlight: Use drm helper to initialize edp backlight") Signed-off-by: Aaron Ma Reviewed-by: Suraj Kandpal Signed-off-by: Suraj Kandpal Link: https://lore.kernel.org/r/20250823121647.275834-1-aaron.ma@canonical.com (cherry picked from commit 72136efb875d8438c20b9c8ab72945d474933471) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c index 41228478b21c..0a3a3f6a5f9d 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c +++ b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c @@ -546,7 +546,7 @@ static int intel_dp_aux_vesa_setup_backlight(struct intel_connector *connector, luminance_range->max_luminance, panel->vbt.backlight.pwm_freq_hz, intel_dp->edp_dpcd, ¤t_level, ¤t_mode, - false); + panel->backlight.edp.vesa.luminance_control_support); if (ret < 0) return ret; From 1b09d08866277677d11726116f5e786d5ba00173 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Mon, 15 Sep 2025 14:35:46 +0530 Subject: [PATCH 362/450] platform/x86/amd/pmf: Support new ACPI ID AMDI0108 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include the ACPI ID AMDI0108, which is used on upcoming AMD platforms, in the PMF driver's list of supported devices. Signed-off-by: Shyam Sundar S K Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20250915090546.2759130-1-Shyam-sundar.S-k@amd.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmf/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/amd/pmf/core.c b/drivers/platform/x86/amd/pmf/core.c index ef988605c4da..bc544a4a5266 100644 --- a/drivers/platform/x86/amd/pmf/core.c +++ b/drivers/platform/x86/amd/pmf/core.c @@ -403,6 +403,7 @@ static const struct acpi_device_id amd_pmf_acpi_ids[] = { {"AMDI0103", 0}, {"AMDI0105", 0}, {"AMDI0107", 0}, + {"AMDI0108", 0}, { } }; MODULE_DEVICE_TABLE(acpi, amd_pmf_acpi_ids); From 225d1ee0f5ba3218d1814d36564fdb5f37b50474 Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Tue, 16 Sep 2025 09:28:18 +0200 Subject: [PATCH 363/450] platform/x86: asus-wmi: Re-add extra keys to ignore_key_wlan quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that the dual screen models use 0x5E for attaching and detaching the keyboard instead of 0x5F. So, re-add the codes by reverting commit cf3940ac737d ("platform/x86: asus-wmi: Remove extra keys from ignore_key_wlan quirk"). For our future reference, add a comment next to 0x5E indicating that it is used for that purpose. Fixes: cf3940ac737d ("platform/x86: asus-wmi: Remove extra keys from ignore_key_wlan quirk") Reported-by: Rahul Chandra Closes: https://lore.kernel.org/all/10020-68c90c80-d-4ac6c580@106290038/ Cc: stable@kernel.org Signed-off-by: Antheas Kapenekakis Link: https://patch.msgid.link/20250916072818.196462-1-lkml@antheas.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-nb-wmi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index 3a488cf9ca06..6a62bc5b02fd 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -673,6 +673,8 @@ static void asus_nb_wmi_key_filter(struct asus_wmi_driver *asus_wmi, int *code, if (atkbd_reports_vol_keys) *code = ASUS_WMI_KEY_IGNORE; break; + case 0x5D: /* Wireless console Toggle */ + case 0x5E: /* Wireless console Enable / Keyboard Attach, Detach */ case 0x5F: /* Wireless console Disable / Special Key */ if (quirks->key_wlan_event) *code = quirks->key_wlan_event; From 288dac9fb6084330d968459c750c838fd06e10e6 Mon Sep 17 00:00:00 2001 From: Qi Xi Date: Thu, 4 Sep 2025 11:44:47 +0800 Subject: [PATCH 364/450] drm: bridge: cdns-mhdp8546: Fix missing mutex unlock on error path Add missing mutex unlock before returning from the error path in cdns_mhdp_atomic_enable(). Fixes: 935a92a1c400 ("drm: bridge: cdns-mhdp8546: Fix possible null pointer dereference") Reported-by: Hulk Robot Signed-off-by: Qi Xi Reviewed-by: Luca Ceresoli Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250904034447.665427-1-xiqi2@huawei.com Signed-off-by: Luca Ceresoli --- drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c b/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c index a614d1384f71..38726ae1bf15 100644 --- a/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c +++ b/drivers/gpu/drm/bridge/cadence/cdns-mhdp8546-core.c @@ -1984,8 +1984,10 @@ static void cdns_mhdp_atomic_enable(struct drm_bridge *bridge, mhdp_state = to_cdns_mhdp_bridge_state(new_state); mhdp_state->current_mode = drm_mode_duplicate(bridge->dev, mode); - if (!mhdp_state->current_mode) - return; + if (!mhdp_state->current_mode) { + ret = -EINVAL; + goto out; + } drm_mode_set_name(mhdp_state->current_mode); From cbc7f3b4f6ca19320e2eacf8fc1403d6f331ce14 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 7 Aug 2025 18:53:41 +0300 Subject: [PATCH 365/450] drm/xe: Fix a NULL vs IS_ERR() in xe_vm_add_compute_exec_queue() The xe_preempt_fence_create() function returns error pointers. It never returns NULL. Update the error checking to match. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Dan Carpenter Reviewed-by: Matthew Brost Link: https://lore.kernel.org/r/aJTMBdX97cof_009@stanley.mountain Signed-off-by: Rodrigo Vivi (cherry picked from commit 75cc23ffe5b422bc3cbd5cf0956b8b86e4b0e162) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_vm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index dc4f61e56579..5146999d27fa 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -240,8 +240,8 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q) pfence = xe_preempt_fence_create(q, q->lr.context, ++q->lr.seqno); - if (!pfence) { - err = -ENOMEM; + if (IS_ERR(pfence)) { + err = PTR_ERR(pfence); goto out_fini; } From f0bd03832f5c84f90919bd018156b1b6eb911692 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 10 Sep 2025 19:11:06 +0800 Subject: [PATCH 366/450] md: init queue_limits->max_hw_wzeroes_unmap_sectors parameter The parameter max_hw_wzeroes_unmap_sectors in queue_limits should be equal to max_write_zeroes_sectors if it is set to a non-zero value. However, the stacked md drivers call md_init_stacking_limits() to initialize this parameter to UINT_MAX but only adjust max_write_zeroes_sectors when setting limits. Therefore, this discrepancy triggers a value check failure in blk_validate_limits(). $ modprobe scsi_debug num_parts=2 dev_size_mb=8 lbprz=1 lbpws=1 $ mdadm --create /dev/md0 --level=0 --raid-device=2 /dev/sda1 /dev/sda2 mdadm: Defaulting to version 1.2 metadata mdadm: RUN_ARRAY failed: Invalid argument Fix this failure by explicitly setting max_hw_wzeroes_unmap_sectors to max_write_zeroes_sectors. Since the linear and raid0 drivers support write zeroes, so they can support unmap write zeroes operation if all of the backend devices support it. However, the raid1/10/5 drivers don't support write zeroes, so we have to set it to zero. Fixes: 0c40d7cb5ef3 ("block: introduce max_{hw|user}_wzeroes_unmap_sectors to queue limits") Reported-by: John Garry Closes: https://lore.kernel.org/linux-block/803a2183-a0bb-4b7a-92f1-afc5097630d2@oracle.com/ Signed-off-by: Zhang Yi Tested-by: John Garry Reviewed-by: Li Nan Reviewed-by: Martin K. Petersen Reviewed-by: Yu Kuai Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/linux-raid/20250910111107.3247530-2-yi.zhang@huaweicloud.com Signed-off-by: Yu Kuai --- drivers/md/md-linear.c | 1 + drivers/md/raid0.c | 1 + drivers/md/raid1.c | 1 + drivers/md/raid10.c | 1 + drivers/md/raid5.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 5d9b08115375..3e1f165c2d20 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -73,6 +73,7 @@ static int linear_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f1d8811a542a..419139ad7663 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -382,6 +382,7 @@ static int raid0_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; lim.chunk_sectors = mddev->chunk_sectors; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bf44878ec640..d30b82beeb92 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3211,6 +3211,7 @@ static int raid1_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b60c30bfb6c7..9832eefb2f15 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4008,6 +4008,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.chunk_sectors = mddev->chunk_sectors; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 023649fe2476..e385ef1355e8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7732,6 +7732,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; mddev_stack_rdev_limits(mddev, &lim, 0); rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, From 0b47b6c3543efd65f2e620e359b05f4938314fbd Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 12 Sep 2025 18:14:38 +0200 Subject: [PATCH 367/450] Revert "sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scx_bpf_reenqueue_local() can be called from ops.cpu_release() when a CPU is taken by a higher scheduling class to give tasks queued to the CPU's local DSQ a chance to be migrated somewhere else, instead of waiting indefinitely for that CPU to become available again. In doing so, we decided to skip migration-disabled tasks, under the assumption that they cannot be migrated anyway. However, when a higher scheduling class preempts a CPU, the running task is always inserted at the head of the local DSQ as a migration-disabled task. This means it is always skipped by scx_bpf_reenqueue_local(), and ends up being confined to the same CPU even if that CPU is heavily contended by other higher scheduling class tasks. As an example, let's consider the following scenario: $ schedtool -a 0,1, -e yes > /dev/null $ sudo schedtool -F -p 99 -a 0, -e \ stress-ng -c 1 --cpu-load 99 --cpu-load-slice 1000 The first task (SCHED_EXT) can run on CPU0 or CPU1. The second task (SCHED_FIFO) is pinned to CPU0 and consumes ~99% of it. If the SCHED_EXT task initially runs on CPU0, it will remain there because it always sees CPU0 as "idle" in the short gaps left by the RT task, resulting in ~1% utilization while CPU1 stays idle: 0[||||||||||||||||||||||100.0%] 8[ 0.0%] 1[ 0.0%] 9[ 0.0%] 2[ 0.0%] 10[ 0.0%] 3[ 0.0%] 11[ 0.0%] 4[ 0.0%] 12[ 0.0%] 5[ 0.0%] 13[ 0.0%] 6[ 0.0%] 14[ 0.0%] 7[ 0.0%] 15[ 0.0%] PID USER PRI NI S CPU CPU%▽MEM% TIME+ Command 1067 root RT 0 R 0 99.0 0.2 0:31.16 stress-ng-cpu [run] 975 arighi 20 0 R 0 1.0 0.0 0:26.32 yes By allowing scx_bpf_reenqueue_local() to re-enqueue migration-disabled tasks, the scheduler can choose to migrate them to other CPUs (CPU1 in this case) via ops.enqueue(), leading to better CPU utilization: 0[||||||||||||||||||||||100.0%] 8[ 0.0%] 1[||||||||||||||||||||||100.0%] 9[ 0.0%] 2[ 0.0%] 10[ 0.0%] 3[ 0.0%] 11[ 0.0%] 4[ 0.0%] 12[ 0.0%] 5[ 0.0%] 13[ 0.0%] 6[ 0.0%] 14[ 0.0%] 7[ 0.0%] 15[ 0.0%] PID USER PRI NI S CPU CPU%▽MEM% TIME+ Command 577 root RT 0 R 0 100.0 0.2 0:23.17 stress-ng-cpu [run] 555 arighi 20 0 R 1 100.0 0.0 0:28.67 yes It's debatable whether per-CPU tasks should be re-enqueued as well, but doing so is probably safer: the scheduler can recognize re-enqueued tasks through the %SCX_ENQ_REENQ flag, reassess their placement, and either put them back at the head of the local DSQ or let another task attempt to take the CPU. This also prevents giving per-CPU tasks an implicit priority boost, which would otherwise make them more likely to reclaim CPUs preempted by higher scheduling classes. Fixes: 97e13ecb02668 ("sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()") Cc: stable@vger.kernel.org # v6.15+ Signed-off-by: Andrea Righi Acked-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4ae32ef179dd..088ceff38c8a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6788,12 +6788,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. - * - * Also skip re-enqueueing tasks that can only run on this - * CPU, as they would just be re-added to the same local - * DSQ without any benefit. */ - if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) + if (p->migration_pending) continue; dispatch_dequeue(rq, p); From 84bf1ac85af84d354c7a2fdbdc0d4efc8aaec34b Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 25 Aug 2025 16:00:14 -0700 Subject: [PATCH 368/450] ice: fix Rx page leak on multi-buffer frames The ice_put_rx_mbuf() function handles calling ice_put_rx_buf() for each buffer in the current frame. This function was introduced as part of handling multi-buffer XDP support in the ice driver. It works by iterating over the buffers from first_desc up to 1 plus the total number of fragments in the frame, cached from before the XDP program was executed. If the hardware posts a descriptor with a size of 0, the logic used in ice_put_rx_mbuf() breaks. Such descriptors get skipped and don't get added as fragments in ice_add_xdp_frag. Since the buffer isn't counted as a fragment, we do not iterate over it in ice_put_rx_mbuf(), and thus we don't call ice_put_rx_buf(). Because we don't call ice_put_rx_buf(), we don't attempt to re-use the page or free it. This leaves a stale page in the ring, as we don't increment next_to_alloc. The ice_reuse_rx_page() assumes that the next_to_alloc has been incremented properly, and that it always points to a buffer with a NULL page. Since this function doesn't check, it will happily recycle a page over the top of the next_to_alloc buffer, losing track of the old page. Note that this leak only occurs for multi-buffer frames. The ice_put_rx_mbuf() function always handles at least one buffer, so a single-buffer frame will always get handled correctly. It is not clear precisely why the hardware hands us descriptors with a size of 0 sometimes, but it happens somewhat regularly with "jumbo frames" used by 9K MTU. To fix ice_put_rx_mbuf(), we need to make sure to call ice_put_rx_buf() on all buffers between first_desc and next_to_clean. Borrow the logic of a similar function in i40e used for this same purpose. Use the same logic also in ice_get_pgcnts(). Instead of iterating over just the number of fragments, use a loop which iterates until the current index reaches to the next_to_clean element just past the current frame. Unlike i40e, the ice_put_rx_mbuf() function does call ice_put_rx_buf() on the last buffer of the frame indicating the end of packet. For non-linear (multi-buffer) frames, we need to take care when adjusting the pagecnt_bias. An XDP program might release fragments from the tail of the frame, in which case that fragment page is already released. Only update the pagecnt_bias for the first descriptor and fragments still remaining post-XDP program. Take care to only access the shared info for fragmented buffers, as this avoids a significant cache miss. The xdp_xmit value only needs to be updated if an XDP program is run, and only once per packet. Drop the xdp_xmit pointer argument from ice_put_rx_mbuf(). Instead, set xdp_xmit in the ice_clean_rx_irq() function directly. This avoids needing to pass the argument and avoids an extra bit-wise OR for each buffer in the frame. Move the increment of the ntc local variable to ensure its updated *before* all calls to ice_get_pgcnts() or ice_put_rx_mbuf(), as the loop logic requires the index of the element just after the current frame. Now that we use an index pointer in the ring to identify the packet, we no longer need to track or cache the number of fragments in the rx_ring. Cc: Christoph Petrausch Cc: Jesper Dangaard Brouer Reported-by: Jaroslav Pulchart Closes: https://lore.kernel.org/netdev/CAK8fFZ4hY6GUJNENz3wY9jaYLZXGfpr7dnZxzGMYoE44caRbgw@mail.gmail.com/ Fixes: 743bbd93cf29 ("ice: put Rx buffers after being done with current frame") Tested-by: Michal Kubiak Signed-off-by: Jacob Keller Acked-by: Jesper Dangaard Brouer Tested-by: Priya Singh Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 80 ++++++++++------------- drivers/net/ethernet/intel/ice/ice_txrx.h | 1 - 2 files changed, 34 insertions(+), 47 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index d2871757ec94..41e7e29879a3 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -894,10 +894,6 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page, rx_buf->page_offset, size); sinfo->xdp_frags_size += size; - /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail() - * can pop off frags but driver has to handle it on its own - */ - rx_ring->nr_frags = sinfo->nr_frags; if (page_is_pfmemalloc(rx_buf->page)) xdp_buff_set_frag_pfmemalloc(xdp); @@ -968,20 +964,20 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size, /** * ice_get_pgcnts - grab page_count() for gathered fragments * @rx_ring: Rx descriptor ring to store the page counts on + * @ntc: the next to clean element (not included in this frame!) * * This function is intended to be called right before running XDP * program so that the page recycling mechanism will be able to take * a correct decision regarding underlying pages; this is done in such * way as XDP program can change the refcount of page */ -static void ice_get_pgcnts(struct ice_rx_ring *rx_ring) +static void ice_get_pgcnts(struct ice_rx_ring *rx_ring, unsigned int ntc) { - u32 nr_frags = rx_ring->nr_frags + 1; u32 idx = rx_ring->first_desc; struct ice_rx_buf *rx_buf; u32 cnt = rx_ring->count; - for (int i = 0; i < nr_frags; i++) { + while (idx != ntc) { rx_buf = &rx_ring->rx_buf[idx]; rx_buf->pgcnt = page_count(rx_buf->page); @@ -1154,62 +1150,51 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf) } /** - * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all frame frags + * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all buffers in frame * @rx_ring: Rx ring with all the auxiliary data * @xdp: XDP buffer carrying linear + frags part - * @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage - * @ntc: a current next_to_clean value to be stored at rx_ring + * @ntc: the next to clean element (not included in this frame!) * @verdict: return code from XDP program execution * - * Walk through gathered fragments and satisfy internal page - * recycle mechanism; we take here an action related to verdict - * returned by XDP program; + * Called after XDP program is completed, or on error with verdict set to + * ICE_XDP_CONSUMED. + * + * Walk through buffers from first_desc to the end of the frame, releasing + * buffers and satisfying internal page recycle mechanism. The action depends + * on verdict from XDP program. */ static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, - u32 *xdp_xmit, u32 ntc, u32 verdict) + u32 ntc, u32 verdict) { - u32 nr_frags = rx_ring->nr_frags + 1; u32 idx = rx_ring->first_desc; u32 cnt = rx_ring->count; - u32 post_xdp_frags = 1; struct ice_rx_buf *buf; - int i; + u32 xdp_frags = 0; + int i = 0; if (unlikely(xdp_buff_has_frags(xdp))) - post_xdp_frags += xdp_get_shared_info_from_buff(xdp)->nr_frags; + xdp_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; - for (i = 0; i < post_xdp_frags; i++) { + while (idx != ntc) { buf = &rx_ring->rx_buf[idx]; + if (++idx == cnt) + idx = 0; - if (verdict & (ICE_XDP_TX | ICE_XDP_REDIR)) { + /* An XDP program could release fragments from the end of the + * buffer. For these, we need to keep the pagecnt_bias as-is. + * To do this, only adjust pagecnt_bias for fragments up to + * the total remaining after the XDP program has run. + */ + if (verdict != ICE_XDP_CONSUMED) ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - *xdp_xmit |= verdict; - } else if (verdict & ICE_XDP_CONSUMED) { + else if (i++ <= xdp_frags) buf->pagecnt_bias++; - } else if (verdict == ICE_XDP_PASS) { - ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - } ice_put_rx_buf(rx_ring, buf); - - if (++idx == cnt) - idx = 0; - } - /* handle buffers that represented frags released by XDP prog; - * for these we keep pagecnt_bias as-is; refcount from struct page - * has been decremented within XDP prog and we do not have to increase - * the biased refcnt - */ - for (; i < nr_frags; i++) { - buf = &rx_ring->rx_buf[idx]; - ice_put_rx_buf(rx_ring, buf); - if (++idx == cnt) - idx = 0; } xdp->data = NULL; rx_ring->first_desc = ntc; - rx_ring->nr_frags = 0; } /** @@ -1317,6 +1302,10 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) /* retrieve a buffer from the ring */ rx_buf = ice_get_rx_buf(rx_ring, size, ntc); + /* Increment ntc before calls to ice_put_rx_mbuf() */ + if (++ntc == cnt) + ntc = 0; + if (!xdp->data) { void *hard_start; @@ -1325,24 +1314,23 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) xdp_prepare_buff(xdp, hard_start, offset, size, !!offset); xdp_buff_clear_frags_flag(xdp); } else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) { - ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc, ICE_XDP_CONSUMED); + ice_put_rx_mbuf(rx_ring, xdp, ntc, ICE_XDP_CONSUMED); break; } - if (++ntc == cnt) - ntc = 0; /* skip if it is NOP desc */ if (ice_is_non_eop(rx_ring, rx_desc)) continue; - ice_get_pgcnts(rx_ring); + ice_get_pgcnts(rx_ring, ntc); xdp_verdict = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc); if (xdp_verdict == ICE_XDP_PASS) goto construct_skb; total_rx_bytes += xdp_get_buff_len(xdp); total_rx_pkts++; - ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); + ice_put_rx_mbuf(rx_ring, xdp, ntc, xdp_verdict); + xdp_xmit |= xdp_verdict & (ICE_XDP_TX | ICE_XDP_REDIR); continue; construct_skb: @@ -1355,7 +1343,7 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) rx_ring->ring_stats->rx_stats.alloc_buf_failed++; xdp_verdict = ICE_XDP_CONSUMED; } - ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); + ice_put_rx_mbuf(rx_ring, xdp, ntc, xdp_verdict); if (!skb) break; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index fef750c5f288..2fd8e78178a2 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -358,7 +358,6 @@ struct ice_rx_ring { struct ice_tx_ring *xdp_ring; struct ice_rx_ring *next; /* pointer to next ring in q_vector */ struct xsk_buff_pool *xsk_pool; - u32 nr_frags; u16 max_frame; u16 rx_buf_len; dma_addr_t dma; /* physical address of ring */ From e37084a26070c546ae7961ee135bbfb15fbe13fd Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 22 Aug 2025 17:16:17 +0200 Subject: [PATCH 369/450] i40e: remove redundant memory barrier when cleaning Tx descs i40e has a feature which writes to memory location last descriptor successfully sent. Memory barrier in i40e_clean_tx_irq() was used to avoid forward-reading descriptor fields in case DD bit was not set. Having mentioned feature in place implies that such situation will not happen as we know in advance how many descriptors HW has dealt with. Besides, this barrier placement was wrong. Idea is to have this protection *after* reading DD bit from HW descriptor, not before. Digging through git history showed me that indeed barrier was before DD bit check, anyways the commit introducing i40e_get_head() should have wiped it out altogether. Also, there was one commit doing s/read_barrier_depends/smp_rmb when get head feature was already in place, but it was only theoretical based on ixgbe experiences, which is different in these terms as that driver has to read DD bit from HW descriptor. Fixes: 1943d8ba9507 ("i40e/i40evf: enable hardware feature head write back") Signed-off-by: Maciej Fijalkowski Reviewed-by: Aleksandr Loktionov Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 048c33039130..b194eae03208 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -948,9 +948,6 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi, if (!eop_desc) break; - /* prevent any other reads prior to eop_desc */ - smp_rmb(); - i40e_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf); /* we have caught up to head, no work left to do */ if (tx_head == tx_desc) From b85936e95a4bd2a07e134af71e2c0750a69d2b8b Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 8 Sep 2025 13:26:28 +0200 Subject: [PATCH 370/450] ixgbe: initialize aci.lock before it's used Currently aci.lock is initialized too late. A bunch of ACI callbacks using the lock are called prior it's initialized. Commit 337369f8ce9e ("locking/mutex: Add MUTEX_WARN_ON() into fast path") highlights that issue what results in call trace. [ 4.092899] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [ 4.092910] WARNING: CPU: 0 PID: 578 at kernel/locking/mutex.c:154 mutex_lock+0x6d/0x80 [ 4.098757] Call Trace: [ 4.098847] [ 4.098922] ixgbe_aci_send_cmd+0x8c/0x1e0 [ixgbe] [ 4.099108] ? hrtimer_try_to_cancel+0x18/0x110 [ 4.099277] ixgbe_aci_get_fw_ver+0x52/0xa0 [ixgbe] [ 4.099460] ixgbe_check_fw_error+0x1fc/0x2f0 [ixgbe] [ 4.099650] ? usleep_range_state+0x69/0xd0 [ 4.099811] ? usleep_range_state+0x8c/0xd0 [ 4.099964] ixgbe_probe+0x3b0/0x12d0 [ixgbe] [ 4.100132] local_pci_probe+0x43/0xa0 [ 4.100267] work_for_cpu_fn+0x13/0x20 [ 4.101647] Move aci.lock mutex initialization to ixgbe_sw_init() before any ACI command is sent. Along with that move also related SWFW semaphore in order to reduce size of ixgbe_probe() and that way all locks are initialized in ixgbe_sw_init(). Reviewed-by: Michal Swiatkowski Reviewed-by: Przemek Kitszel Reviewed-by: Simon Horman Reviewed-by: Aleksandr Loktionov Fixes: 4600cdf9f5ac ("ixgbe: Enable link management in E610 device") Signed-off-by: Jedrzej Jagielski Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 80e6a2ef1350..9a6a67a6d644 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6973,6 +6973,13 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter, break; } + /* Make sure the SWFW semaphore is in a valid state */ + if (hw->mac.ops.init_swfw_sync) + hw->mac.ops.init_swfw_sync(hw); + + if (hw->mac.type == ixgbe_mac_e610) + mutex_init(&hw->aci.lock); + #ifdef IXGBE_FCOE /* FCoE support exists, always init the FCoE lock */ spin_lock_init(&adapter->fcoe.lock); @@ -11643,10 +11650,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_sw_init; - /* Make sure the SWFW semaphore is in a valid state */ - if (hw->mac.ops.init_swfw_sync) - hw->mac.ops.init_swfw_sync(hw); - if (ixgbe_check_fw_error(adapter)) return ixgbe_recovery_probe(adapter); @@ -11850,8 +11853,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ether_addr_copy(hw->mac.addr, hw->mac.perm_addr); ixgbe_mac_set_default_filter(adapter); - if (hw->mac.type == ixgbe_mac_e610) - mutex_init(&hw->aci.lock); timer_setup(&adapter->service_timer, ixgbe_service_timer, 0); if (ixgbe_removed(hw->hw_addr)) { @@ -12007,9 +12008,9 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) devl_unlock(adapter->devlink); ixgbe_release_hw_control(adapter); ixgbe_clear_interrupt_scheme(adapter); +err_sw_init: if (hw->mac.type == ixgbe_mac_e610) mutex_destroy(&adapter->hw.aci.lock); -err_sw_init: ixgbe_disable_sriov(adapter); adapter->flags2 &= ~IXGBE_FLAG2_SEARCH_FOR_SFP; iounmap(adapter->io_addr); From 316ba68175b04a9f6f75295764789ea94e31d48c Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 8 Sep 2025 13:26:29 +0200 Subject: [PATCH 371/450] ixgbe: destroy aci.lock later within ixgbe_remove path There's another issue with aci.lock and previous patch uncovers it. aci.lock is being destroyed during removing ixgbe while some of the ixgbe closing routines are still ongoing. These routines use Admin Command Interface which require taking aci.lock which has been already destroyed what leads to call trace. [ +0.000004] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [ +0.000007] WARNING: CPU: 12 PID: 10277 at kernel/locking/mutex.c:155 mutex_lock+0x5f/0x70 [ +0.000002] Call Trace: [ +0.000003] [ +0.000006] ixgbe_aci_send_cmd+0xc8/0x220 [ixgbe] [ +0.000049] ? try_to_wake_up+0x29d/0x5d0 [ +0.000009] ixgbe_disable_rx_e610+0xc4/0x110 [ixgbe] [ +0.000032] ixgbe_disable_rx+0x3d/0x200 [ixgbe] [ +0.000027] ixgbe_down+0x102/0x3b0 [ixgbe] [ +0.000031] ixgbe_close_suspend+0x28/0x90 [ixgbe] [ +0.000028] ixgbe_close+0xfb/0x100 [ixgbe] [ +0.000025] __dev_close_many+0xae/0x220 [ +0.000005] dev_close_many+0xc2/0x1a0 [ +0.000004] ? kernfs_should_drain_open_files+0x2a/0x40 [ +0.000005] unregister_netdevice_many_notify+0x204/0xb00 [ +0.000006] ? __kernfs_remove.part.0+0x109/0x210 [ +0.000006] ? kobj_kset_leave+0x4b/0x70 [ +0.000008] unregister_netdevice_queue+0xf6/0x130 [ +0.000006] unregister_netdev+0x1c/0x40 [ +0.000005] ixgbe_remove+0x216/0x290 [ixgbe] [ +0.000021] pci_device_remove+0x42/0xb0 [ +0.000007] device_release_driver_internal+0x19c/0x200 [ +0.000008] driver_detach+0x48/0x90 [ +0.000003] bus_remove_driver+0x6d/0xf0 [ +0.000006] pci_unregister_driver+0x2e/0xb0 [ +0.000005] ixgbe_exit_module+0x1c/0xc80 [ixgbe] Same as for the previous commit, the issue has been highlighted by the commit 337369f8ce9e ("locking/mutex: Add MUTEX_WARN_ON() into fast path"). Move destroying aci.lock to the end of ixgbe_remove(), as this simply fixes the issue. Fixes: 4600cdf9f5ac ("ixgbe: Enable link management in E610 device") Signed-off-by: Jedrzej Jagielski Reviewed-by: Przemek Kitszel Reviewed-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9a6a67a6d644..6218bdb7f941 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -12061,10 +12061,8 @@ static void ixgbe_remove(struct pci_dev *pdev) set_bit(__IXGBE_REMOVING, &adapter->state); cancel_work_sync(&adapter->service_task); - if (adapter->hw.mac.type == ixgbe_mac_e610) { + if (adapter->hw.mac.type == ixgbe_mac_e610) ixgbe_disable_link_status_events(adapter); - mutex_destroy(&adapter->hw.aci.lock); - } if (adapter->mii_bus) mdiobus_unregister(adapter->mii_bus); @@ -12124,6 +12122,9 @@ static void ixgbe_remove(struct pci_dev *pdev) disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state); free_netdev(netdev); + if (adapter->hw.mac.type == ixgbe_mac_e610) + mutex_destroy(&adapter->hw.aci.lock); + if (disable_dev) pci_disable_device(pdev); } From 528eb4e19ec0df30d0c9ae4074ce945667dde919 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Wed, 10 Sep 2025 22:47:21 +0900 Subject: [PATCH 372/450] igc: don't fail igc_probe() on LED setup error When igc_led_setup() fails, igc_probe() fails and triggers kernel panic in free_netdev() since unregister_netdev() is not called. [1] This behavior can be tested using fault-injection framework, especially the failslab feature. [2] Since LED support is not mandatory, treat LED setup failures as non-fatal and continue probe with a warning message, consequently avoiding the kernel panic. [1] kernel BUG at net/core/dev.c:12047! Oops: invalid opcode: 0000 [#1] SMP NOPTI CPU: 0 UID: 0 PID: 937 Comm: repro-igc-led-e Not tainted 6.17.0-rc4-enjuk-tnguy-00865-gc4940196ab02 #64 PREEMPT(voluntary) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:free_netdev+0x278/0x2b0 [...] Call Trace: igc_probe+0x370/0x910 local_pci_probe+0x3a/0x80 pci_device_probe+0xd1/0x200 [...] [2] #!/bin/bash -ex FAILSLAB_PATH=/sys/kernel/debug/failslab/ DEVICE=0000:00:05.0 START_ADDR=$(grep " igc_led_setup" /proc/kallsyms \ | awk '{printf("0x%s", $1)}') END_ADDR=$(printf "0x%x" $((START_ADDR + 0x100))) echo $START_ADDR > $FAILSLAB_PATH/require-start echo $END_ADDR > $FAILSLAB_PATH/require-end echo 1 > $FAILSLAB_PATH/times echo 100 > $FAILSLAB_PATH/probability echo N > $FAILSLAB_PATH/ignore-gfp-wait echo $DEVICE > /sys/bus/pci/drivers/igc/bind Fixes: ea578703b03d ("igc: Add support for LEDs on i225/i226") Signed-off-by: Kohei Enju Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Reviewed-by: Vitaly Lifshits Reviewed-by: Kurt Kanzenbach Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 1 + drivers/net/ethernet/intel/igc/igc_main.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 266bfcf2a28f..a427f05814c1 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -345,6 +345,7 @@ struct igc_adapter { /* LEDs */ struct mutex led_mutex; struct igc_led_classdev *leds; + bool leds_available; }; void igc_up(struct igc_adapter *adapter); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index e79b14d50b24..728d7ca5338b 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7335,8 +7335,14 @@ static int igc_probe(struct pci_dev *pdev, if (IS_ENABLED(CONFIG_IGC_LEDS)) { err = igc_led_setup(adapter); - if (err) - goto err_register; + if (err) { + netdev_warn_once(netdev, + "LED init failed (%d); continuing without LED support\n", + err); + adapter->leds_available = false; + } else { + adapter->leds_available = true; + } } return 0; @@ -7392,7 +7398,7 @@ static void igc_remove(struct pci_dev *pdev) cancel_work_sync(&adapter->watchdog_task); hrtimer_cancel(&adapter->hrtimer); - if (IS_ENABLED(CONFIG_IGC_LEDS)) + if (IS_ENABLED(CONFIG_IGC_LEDS) && adapter->leds_available) igc_led_free(adapter); /* Release control of h/w to f/w. If f/w is AMT enabled, this From f9b80514a7227c589291792cb6743b0ddf41c2bc Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 15 Sep 2025 20:59:02 -0500 Subject: [PATCH 373/450] drm/amd: Only restore cached manual clock settings in restore if OD enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If OD is not enabled then restoring cached clock settings doesn't make sense and actually leads to errors in resume. Check if enabled before restoring settings. Fixes: 4e9526924d09 ("drm/amd: Restore cached manual clock settings during resume") Reported-by: Jérôme Lécuyer Closes: https://lore.kernel.org/amd-gfx/0ffe2692-7bfa-4821-856e-dd0f18e2c32b@amd.com/T/#me6db8ddb192626360c462b7570ed7eba0c6c9733 Suggested-by: Jérôme Lécuyer Acked-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 1a4dd33cc6e1baaa81efdbe68227a19f51c50f20) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index b47cb4a5f488..408f05dfab90 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -2236,7 +2236,7 @@ static int smu_resume(struct amdgpu_ip_block *ip_block) return ret; } - if (smu_dpm_ctx->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL) { + if (smu_dpm_ctx->dpm_level == AMD_DPM_FORCED_LEVEL_MANUAL && smu->od_enabled) { ret = smu_od_edit_dpm_table(smu, PP_OD_COMMIT_DPM_TABLE, NULL, 0); if (ret) return ret; From 109f8b51543d106aee50dfe911f439e43fb30c7a Mon Sep 17 00:00:00 2001 From: "Remy D. Farley" Date: Sat, 13 Sep 2025 14:05:28 +0000 Subject: [PATCH 374/450] doc/netlink: Fix typos in operation attributes I'm trying to generate Rust bindings for netlink using the yaml spec. It looks like there's a typo in conntrack spec: attribute set conntrack-attrs defines attributes "counters-{orig,reply}" (plural), while get operation references "counter-{orig,reply}" (singular). The latter should be fixed, as it denotes multiple counters (packet and byte). The corresonding C define is CTA_COUNTERS_ORIG. Also, dump request references "nfgen-family" attribute, which neither exists in conntrack-attrs attrset nor ctattr_type enum. There's member of nfgenmsg struct with the same name, which is where family value is actually taken from. > static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, > struct sk_buff *skb, > const struct nlmsghdr *nlh, > const struct nlattr * const cda[], > struct netlink_ext_ack *extack) > { > int err; > struct nfgenmsg *nfmsg = nlmsg_data(nlh); > u_int8_t u3 = nfmsg->nfgen_family; ^^^^^^^^^^^^ Signed-off-by: Remy D. Farley Fixes: 23fc9311a526 ("netlink: specs: add conntrack dump and stats dump support") Link: https://patch.msgid.link/20250913140515.1132886-1-one-d-wide@protonmail.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/conntrack.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/netlink/specs/conntrack.yaml b/Documentation/netlink/specs/conntrack.yaml index c6832633ab7b..591e22a2ee43 100644 --- a/Documentation/netlink/specs/conntrack.yaml +++ b/Documentation/netlink/specs/conntrack.yaml @@ -575,8 +575,8 @@ operations: - nat-dst - timeout - mark - - counter-orig - - counter-reply + - counters-orig + - counters-reply - use - id - nat-dst @@ -591,7 +591,6 @@ operations: request: value: 0x101 attributes: - - nfgen-family - mark - filter - status @@ -608,8 +607,8 @@ operations: - nat-dst - timeout - mark - - counter-orig - - counter-reply + - counters-orig + - counters-reply - use - id - nat-dst From 94ff1ed3030e88cfe4e34c1d47c5832995c953c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Sep 2025 16:42:55 -0700 Subject: [PATCH 375/450] MAINTAINERS: make the DPLL entry cover drivers DPLL maintainers should probably be CCed on driver patches, too. Remove the *, which makes the pattern only match files directly under drivers/dpll but not its sub-directories. Acked-by: Jiri Pirko Acked-by: Vadim Fedorenko Acked-by: Arkadiusz Kubalewski Link: https://patch.msgid.link/20250915234255.1306612-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea95802d87e1..086b4a7bd04f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7430,7 +7430,7 @@ S: Supported F: Documentation/devicetree/bindings/dpll/dpll-device.yaml F: Documentation/devicetree/bindings/dpll/dpll-pin.yaml F: Documentation/driver-api/dpll.rst -F: drivers/dpll/* +F: drivers/dpll/ F: include/linux/dpll.h F: include/uapi/linux/dpll.h From 6b4be64fd9fec16418f365c2d8e47a7566e9eba5 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 15 Sep 2025 15:24:32 +0300 Subject: [PATCH 376/450] net/mlx5e: Harden uplink netdev access against device unbind The function mlx5_uplink_netdev_get() gets the uplink netdevice pointer from mdev->mlx5e_res.uplink_netdev. However, the netdevice can be removed and its pointer cleared when unbound from the mlx5_core.eth driver. This results in a NULL pointer, causing a kernel panic. BUG: unable to handle page fault for address: 0000000000001300 at RIP: 0010:mlx5e_vport_rep_load+0x22a/0x270 [mlx5_core] Call Trace: mlx5_esw_offloads_rep_load+0x68/0xe0 [mlx5_core] esw_offloads_enable+0x593/0x910 [mlx5_core] mlx5_eswitch_enable_locked+0x341/0x420 [mlx5_core] mlx5_devlink_eswitch_mode_set+0x17e/0x3a0 [mlx5_core] devlink_nl_eswitch_set_doit+0x60/0xd0 genl_family_rcv_msg_doit+0xe0/0x130 genl_rcv_msg+0x183/0x290 netlink_rcv_skb+0x4b/0xf0 genl_rcv+0x24/0x40 netlink_unicast+0x255/0x380 netlink_sendmsg+0x1f3/0x420 __sock_sendmsg+0x38/0x60 __sys_sendto+0x119/0x180 do_syscall_64+0x53/0x1d0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Ensure the pointer is valid before use by checking it for NULL. If it is valid, immediately call netdev_hold() to take a reference, and preventing the netdevice from being freed while it is in use. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Reviewed-by: Jiri Pirko Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757939074-617281-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_rep.c | 27 +++++++++++++++---- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 1 + .../ethernet/mellanox/mlx5/core/lib/mlx5.h | 15 ++++++++++- include/linux/mlx5/driver.h | 1 + 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 63a7a788fb0d..cd0242eb008c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1506,12 +1506,21 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = { static int mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { - struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct net_device *netdev; + struct mlx5e_priv *priv; + int err; + netdev = mlx5_uplink_netdev_get(dev); + if (!netdev) + return 0; + + priv = netdev_priv(netdev); rpriv->netdev = priv->netdev; - return mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, - rpriv); + err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, + rpriv); + mlx5_uplink_netdev_put(dev, netdev); + return err; } static void @@ -1638,8 +1647,16 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) { struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); struct net_device *netdev = rpriv->netdev; - struct mlx5e_priv *priv = netdev_priv(netdev); - void *ppriv = priv->ppriv; + struct mlx5e_priv *priv; + void *ppriv; + + if (!netdev) { + ppriv = rpriv; + goto free_ppriv; + } + + priv = netdev_priv(netdev); + ppriv = priv->ppriv; if (rep->vport == MLX5_VPORT_UPLINK) { mlx5e_vport_uplink_rep_unload(rpriv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 8b4977650183..5f2d6c35f1ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1515,6 +1515,7 @@ static u32 mlx5_esw_qos_lag_link_speed_get_locked(struct mlx5_core_dev *mdev) speed = lksettings.base.speed; out: + mlx5_uplink_netdev_put(mdev, slave); return speed; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index b111ccd03b02..74ea5da58b7e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -47,7 +47,20 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data); static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) { - return mdev->mlx5e_res.uplink_netdev; + struct mlx5e_resources *mlx5e_res = &mdev->mlx5e_res; + struct net_device *netdev; + + mutex_lock(&mlx5e_res->uplink_netdev_lock); + netdev = mlx5e_res->uplink_netdev; + netdev_hold(netdev, &mlx5e_res->tracker, GFP_KERNEL); + mutex_unlock(&mlx5e_res->uplink_netdev_lock); + return netdev; +} + +static inline void mlx5_uplink_netdev_put(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + netdev_put(netdev, &mdev->mlx5e_res.tracker); } struct mlx5_sd; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c5fbfb85749..10fe492e1fed 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -663,6 +663,7 @@ struct mlx5e_resources { bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; + netdevice_tracker tracker; struct mutex uplink_netdev_lock; struct mlx5_crypto_dek_priv *dek_priv; }; From 7601a0a46216f4ba05adff2de75923b4e8e585c2 Mon Sep 17 00:00:00 2001 From: Lama Kayal Date: Mon, 15 Sep 2025 15:24:34 +0300 Subject: [PATCH 377/450] net/mlx5e: Add a miss level for ipsec crypto offload The cited commit adds a miss table for switchdev mode. But it uses the same level as policy table. Will hit the following error when running command: # ip xfrm state add src 192.168.1.22 dst 192.168.1.21 proto \ esp spi 1001 reqid 10001 aead 'rfc4106(gcm(aes))' \ 0x3a189a7f9374955d3817886c8587f1da3df387ff 128 \ mode tunnel offload dev enp8s0f0 dir in Error: mlx5_core: Device failed to offload this state. The dmesg error is: mlx5_core 0000:03:00.0: ipsec_miss_create:578:(pid 311797): fail to create IPsec miss_rule err=-22 Fix it by adding a new miss level to avoid the error. Fixes: 7d9e292ecd67 ("net/mlx5e: Move IPSec policy check after decryption") Signed-off-by: Jianbo Liu Signed-off-by: Chris Mi Signed-off-by: Lama Kayal Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757939074-617281-4-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/fs.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 4 ++-- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index 9560fcba643f..ac65e3191480 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -92,6 +92,7 @@ enum { MLX5E_ACCEL_FS_ESP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL, MLX5E_ACCEL_FS_POL_FT_LEVEL, + MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL, MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL, #endif }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h index ffcd0cdeb775..23703f28386a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -185,6 +185,7 @@ struct mlx5e_ipsec_rx_create_attr { u32 family; int prio; int pol_level; + int pol_miss_level; int sa_level; int status_level; enum mlx5_flow_namespace_type chains_ns; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 98b6a3a623f9..65dc3529283b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -747,6 +747,7 @@ static void ipsec_rx_create_attr_set(struct mlx5e_ipsec *ipsec, attr->family = family; attr->prio = MLX5E_NIC_PRIO; attr->pol_level = MLX5E_ACCEL_FS_POL_FT_LEVEL; + attr->pol_miss_level = MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL; attr->sa_level = MLX5E_ACCEL_FS_ESP_FT_LEVEL; attr->status_level = MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL; attr->chains_ns = MLX5_FLOW_NAMESPACE_KERNEL; @@ -833,7 +834,7 @@ static int ipsec_rx_chains_create_miss(struct mlx5e_ipsec *ipsec, ft_attr.max_fte = 1; ft_attr.autogroup.max_num_groups = 1; - ft_attr.level = attr->pol_level; + ft_attr.level = attr->pol_miss_level; ft_attr.prio = attr->prio; ft = mlx5_create_auto_grouped_flow_table(attr->ns, &ft_attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index cb165085a4c1..db552c012b4f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -114,9 +114,9 @@ #define ETHTOOL_NUM_PRIOS 11 #define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS) /* Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy, - * {IPsec RoCE MPV,Alias table},IPsec RoCE policy + * IPsec policy miss, {IPsec RoCE MPV,Alias table},IPsec RoCE policy */ -#define KERNEL_NIC_PRIO_NUM_LEVELS 10 +#define KERNEL_NIC_PRIO_NUM_LEVELS 11 #define KERNEL_NIC_NUM_PRIOS 1 /* One more level for tc, and one more for promisc */ #define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 2) From a1eab4d813f7b6e606ed21381b8cfda5c59a87e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Sep 2025 11:06:42 -1000 Subject: [PATCH 378/450] sched_ext, sched/core: Fix build failure when !FAIR_GROUP_SCHED && EXT_GROUP_SCHED While collecting SCX related fields in struct task_group into struct scx_task_group, 6e6558a6bc41 ("sched_ext, sched/core: Factor out struct scx_task_group") forgot update tg->scx_weight usage in tg_weight(), which leads to build failure when CONFIG_FAIR_GROUP_SCHED is disabled but CONFIG_EXT_GROUP_SCHED is enabled. Fix it. Fixes: 6e6558a6bc41 ("sched_ext, sched/core: Factor out struct scx_task_group") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509170230.MwZsJSWa-lkp@intel.com/ Tested-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..ccba6fc3c3fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9551,7 +9551,7 @@ static unsigned long tg_weight(struct task_group *tg) #ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); #else - return sched_weight_from_cgroup(tg->scx_weight); + return sched_weight_from_cgroup(tg->scx.weight); #endif } From b6f56a44e4c1014b08859dcf04ed246500e310e5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Sep 2025 13:35:15 +0200 Subject: [PATCH 379/450] net: rfkill: gpio: Fix crash due to dereferencering uninitialized pointer Since commit 7d5e9737efda ("net: rfkill: gpio: get the name and type from device property") rfkill_find_type() gets called with the possibly uninitialized "const char *type_name;" local variable. On x86 systems when rfkill-gpio binds to a "BCM4752" or "LNV4752" acpi_device, the rfkill->type is set based on the ACPI acpi_device_id: rfkill->type = (unsigned)id->driver_data; and there is no "type" property so device_property_read_string() will fail and leave type_name uninitialized, leading to a potential crash. rfkill_find_type() does accept a NULL pointer, fix the potential crash by initializing type_name to NULL. Note likely sofar this has not been caught because: 1. Not many x86 machines actually have a "BCM4752"/"LNV4752" acpi_device 2. The stack happened to contain NULL where type_name is stored Fixes: 7d5e9737efda ("net: rfkill: gpio: get the name and type from device property") Cc: stable@vger.kernel.org Cc: Heikki Krogerus Signed-off-by: Hans de Goede Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20250913113515.21698-1-hansg@kernel.org Signed-off-by: Johannes Berg --- net/rfkill/rfkill-gpio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 41e657e97761..cf2dcec6ce5a 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -94,10 +94,10 @@ static const struct dmi_system_id rfkill_gpio_deny_table[] = { static int rfkill_gpio_probe(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill; - struct gpio_desc *gpio; + const char *type_name = NULL; const char *name_property; const char *type_property; - const char *type_name; + struct gpio_desc *gpio; int ret; if (dmi_check_system(rfkill_gpio_deny_table)) From 1dd28fd86c3fa4e395031dd6f2ba920242107010 Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Wed, 17 Sep 2025 08:11:43 +0000 Subject: [PATCH 380/450] ASoC: rt5682s: Adjust SAR ADC button mode to fix noise issue Adjust register settings for SAR adc button detection mode to fix noise issue in headset. Signed-off-by: Jack Yu Link: https://patch.msgid.link/766cd1d2dd7a403ba65bb4cc44845f71@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682s.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sound/soc/codecs/rt5682s.c b/sound/soc/codecs/rt5682s.c index 80b921695e7d..1d80a4b862e2 100644 --- a/sound/soc/codecs/rt5682s.c +++ b/sound/soc/codecs/rt5682s.c @@ -653,14 +653,15 @@ static void rt5682s_sar_power_mode(struct snd_soc_component *component, int mode switch (mode) { case SAR_PWR_SAVING: snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_3, - RT5682S_CBJ_IN_BUF_MASK, RT5682S_CBJ_IN_BUF_DIS); + RT5682S_CBJ_IN_BUF_MASK, RT5682S_CBJ_IN_BUF_EN); snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_1, - RT5682S_MB1_PATH_MASK | RT5682S_MB2_PATH_MASK, - RT5682S_CTRL_MB1_REG | RT5682S_CTRL_MB2_REG); + RT5682S_MB1_PATH_MASK | RT5682S_MB2_PATH_MASK | + RT5682S_VREF_POW_MASK, RT5682S_CTRL_MB1_FSM | + RT5682S_CTRL_MB2_FSM | RT5682S_VREF_POW_FSM); snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK | RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS | - RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU); + RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU); usleep_range(5000, 5500); snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK, RT5682S_SAR_BUTDET_EN); @@ -688,7 +689,7 @@ static void rt5682s_sar_power_mode(struct snd_soc_component *component, int mode snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK | RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS | - RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU); + RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU); break; default: dev_err(component->dev, "Invalid SAR Power mode: %d\n", mode); @@ -725,7 +726,7 @@ static void rt5682s_disable_push_button_irq(struct snd_soc_component *component) snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_BUTDET_MASK | RT5682S_SAR_BUTDET_POW_MASK | RT5682S_SAR_SEL_MB1_2_CTL_MASK, RT5682S_SAR_BUTDET_DIS | - RT5682S_SAR_BUTDET_POW_SAV | RT5682S_SAR_SEL_MB1_2_MANU); + RT5682S_SAR_BUTDET_POW_NORM | RT5682S_SAR_SEL_MB1_2_MANU); } /** @@ -786,7 +787,7 @@ static int rt5682s_headset_detect(struct snd_soc_component *component, int jack_ jack_type = SND_JACK_HEADSET; snd_soc_component_write(component, RT5682S_SAR_IL_CMD_3, 0x024c); snd_soc_component_update_bits(component, RT5682S_CBJ_CTRL_1, - RT5682S_FAST_OFF_MASK, RT5682S_FAST_OFF_EN); + RT5682S_FAST_OFF_MASK, RT5682S_FAST_OFF_DIS); snd_soc_component_update_bits(component, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_SEL_MB1_2_MASK, val << RT5682S_SAR_SEL_MB1_2_SFT); rt5682s_enable_push_button_irq(component); @@ -966,7 +967,7 @@ static int rt5682s_set_jack_detect(struct snd_soc_component *component, RT5682S_EMB_JD_MASK | RT5682S_DET_TYPE | RT5682S_POL_FAST_OFF_MASK | RT5682S_MIC_CAP_MASK, RT5682S_EMB_JD_EN | RT5682S_DET_TYPE | - RT5682S_POL_FAST_OFF_HIGH | RT5682S_MIC_CAP_HS); + RT5682S_POL_FAST_OFF_LOW | RT5682S_MIC_CAP_HS); regmap_update_bits(rt5682s->regmap, RT5682S_SAR_IL_CMD_1, RT5682S_SAR_POW_MASK, RT5682S_SAR_POW_EN); regmap_update_bits(rt5682s->regmap, RT5682S_GPIO_CTRL_1, From 44499ecb4f2817743c37d861bdb3e95f37d3d9cd Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 17 Sep 2025 15:09:01 +0200 Subject: [PATCH 381/450] ALSA: usb: qcom: Fix false-positive address space check The sanity check previously added to uaudio_transfer_buffer_setup() assumed the allocated buffer being linear-mapped. But the buffer allocated via usb_alloc_coherent() isn't always so, rather to be used with (SG-)DMA API. This leaded to a false-positive warning and the driver failed to work. Actually uaudio_transfer_buffer_setup() deals only with the DMA-API addresses for MEM_XFER_BUF type, while other callers of uaudio_iommu_map() are with pages with physical addresses for MEM_EVENT_RING and MEM_XFER_RING types. So this patch splits the mapping helper function to two different ones, uaudio_iommu_map() for the DMA pages and uaudio_iommu_map_pa() for the latter, in order to handle mapping differently for each type. Along with it, the unnecessary address check that caused probe error is dropped, too. Fixes: 3335a1bbd624 ("ALSA: qc_audio_offload: try to reduce address space confusion") Suggested-by: Arnd Bergmann Acked-by: Arnd Bergmann Reported-and-tested-by: Luca Weiss Closes: https://lore.kernel.org/DBR2363A95M1.L9XBNC003490@fairphone.com Signed-off-by: Takashi Iwai --- sound/usb/qcom/qc_audio_offload.c | 92 ++++++++++++++++--------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/sound/usb/qcom/qc_audio_offload.c b/sound/usb/qcom/qc_audio_offload.c index a25c5a531690..9ad76fff741b 100644 --- a/sound/usb/qcom/qc_audio_offload.c +++ b/sound/usb/qcom/qc_audio_offload.c @@ -538,38 +538,33 @@ static void uaudio_iommu_unmap(enum mem_type mtype, unsigned long iova, umap_size, iova, mapped_iova_size); } +static int uaudio_iommu_map_prot(bool dma_coherent) +{ + int prot = IOMMU_READ | IOMMU_WRITE; + + if (dma_coherent) + prot |= IOMMU_CACHE; + return prot; +} + /** - * uaudio_iommu_map() - maps iommu memory for adsp + * uaudio_iommu_map_pa() - maps iommu memory for adsp * @mtype: ring type * @dma_coherent: dma coherent * @pa: physical address for ring/buffer * @size: size of memory region - * @sgt: sg table for memory region * * Maps the XHCI related resources to a memory region that is assigned to be * used by the adsp. This will be mapped to the domain, which is created by * the ASoC USB backend driver. * */ -static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent, - phys_addr_t pa, size_t size, - struct sg_table *sgt) +static unsigned long uaudio_iommu_map_pa(enum mem_type mtype, bool dma_coherent, + phys_addr_t pa, size_t size) { - struct scatterlist *sg; unsigned long iova = 0; - size_t total_len = 0; - unsigned long iova_sg; - phys_addr_t pa_sg; bool map = true; - size_t sg_len; - int prot; - int ret; - int i; - - prot = IOMMU_READ | IOMMU_WRITE; - - if (dma_coherent) - prot |= IOMMU_CACHE; + int prot = uaudio_iommu_map_prot(dma_coherent); switch (mtype) { case MEM_EVENT_RING: @@ -583,20 +578,41 @@ static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent, &uaudio_qdev->xfer_ring_iova_size, &uaudio_qdev->xfer_ring_list, size); break; - case MEM_XFER_BUF: - iova = uaudio_get_iova(&uaudio_qdev->curr_xfer_buf_iova, - &uaudio_qdev->xfer_buf_iova_size, - &uaudio_qdev->xfer_buf_list, size); - break; default: dev_err(uaudio_qdev->data->dev, "unknown mem type %d\n", mtype); } if (!iova || !map) - goto done; + return 0; - if (!sgt) - goto skip_sgt_map; + iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, GFP_KERNEL); + + return iova; +} + +static unsigned long uaudio_iommu_map_xfer_buf(bool dma_coherent, size_t size, + struct sg_table *sgt) +{ + struct scatterlist *sg; + unsigned long iova = 0; + size_t total_len = 0; + unsigned long iova_sg; + phys_addr_t pa_sg; + size_t sg_len; + int prot = uaudio_iommu_map_prot(dma_coherent); + int ret; + int i; + + prot = IOMMU_READ | IOMMU_WRITE; + + if (dma_coherent) + prot |= IOMMU_CACHE; + + iova = uaudio_get_iova(&uaudio_qdev->curr_xfer_buf_iova, + &uaudio_qdev->xfer_buf_iova_size, + &uaudio_qdev->xfer_buf_list, size); + if (!iova) + goto done; iova_sg = iova; for_each_sg(sgt->sgl, sg, sgt->nents, i) { @@ -618,11 +634,6 @@ static unsigned long uaudio_iommu_map(enum mem_type mtype, bool dma_coherent, uaudio_iommu_unmap(MEM_XFER_BUF, iova, size, total_len); iova = 0; } - return iova; - -skip_sgt_map: - iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, GFP_KERNEL); - done: return iova; } @@ -1020,7 +1031,6 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs, struct sg_table xfer_buf_sgt; dma_addr_t xfer_buf_dma; void *xfer_buf; - phys_addr_t xfer_buf_pa; u32 len = xfer_buf_len; bool dma_coherent; dma_addr_t xfer_buf_dma_sysdev; @@ -1051,18 +1061,12 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs, if (!xfer_buf) return -ENOMEM; - /* Remapping is not possible if xfer_buf is outside of linear map */ - xfer_buf_pa = virt_to_phys(xfer_buf); - if (WARN_ON(!page_is_ram(PFN_DOWN(xfer_buf_pa)))) { - ret = -ENXIO; - goto unmap_sync; - } dma_get_sgtable(subs->dev->bus->sysdev, &xfer_buf_sgt, xfer_buf, xfer_buf_dma, len); /* map the physical buffer into sysdev as well */ - xfer_buf_dma_sysdev = uaudio_iommu_map(MEM_XFER_BUF, dma_coherent, - xfer_buf_pa, len, &xfer_buf_sgt); + xfer_buf_dma_sysdev = uaudio_iommu_map_xfer_buf(dma_coherent, + len, &xfer_buf_sgt); if (!xfer_buf_dma_sysdev) { ret = -ENOMEM; goto unmap_sync; @@ -1143,8 +1147,8 @@ uaudio_endpoint_setup(struct snd_usb_substream *subs, sg_free_table(sgt); /* data transfer ring */ - iova = uaudio_iommu_map(MEM_XFER_RING, dma_coherent, tr_pa, - PAGE_SIZE, NULL); + iova = uaudio_iommu_map_pa(MEM_XFER_RING, dma_coherent, tr_pa, + PAGE_SIZE); if (!iova) { ret = -ENOMEM; goto clear_pa; @@ -1207,8 +1211,8 @@ static int uaudio_event_ring_setup(struct snd_usb_substream *subs, mem_info->dma = sg_dma_address(sgt->sgl); sg_free_table(sgt); - iova = uaudio_iommu_map(MEM_EVENT_RING, dma_coherent, er_pa, - PAGE_SIZE, NULL); + iova = uaudio_iommu_map_pa(MEM_EVENT_RING, dma_coherent, er_pa, + PAGE_SIZE); if (!iova) { ret = -ENOMEM; goto clear_pa; From a86556264696b797d94238d99d8284d0d34ed960 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 15 Sep 2025 16:12:40 +0200 Subject: [PATCH 382/450] dm-raid: don't set io_min and io_opt for raid1 These commands modprobe brd rd_size=1048576 vgcreate vg /dev/ram* lvcreate -m4 -L10 -n lv vg trigger the following warnings: device-mapper: table: 252:10: adding target device (start sect 0 len 24576) caused an alignment inconsistency device-mapper: table: 252:10: adding target device (start sect 0 len 24576) caused an alignment inconsistency The warnings are caused by the fact that io_min is 512 and physical block size is 4096. If there's chunk-less raid, such as raid1, io_min shouldn't be set to zero because it would be raised to 512 and it would trigger the warning. Signed-off-by: Mikulas Patocka Reviewed-by: Martin K. Petersen Cc: stable@vger.kernel.org --- drivers/md/dm-raid.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 79ea85d18e24..f4b904e24328 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3813,8 +3813,10 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) struct raid_set *rs = ti->private; unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); - limits->io_min = chunk_size_bytes; - limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + if (chunk_size_bytes) { + limits->io_min = chunk_size_bytes; + limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + } } static void raid_presuspend(struct dm_target *ti) From 027a7a9c07d0d759ab496a7509990aa33a4b689c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 10 Sep 2025 19:11:07 +0800 Subject: [PATCH 383/450] drbd: init queue_limits->max_hw_wzeroes_unmap_sectors parameter The parameter max_hw_wzeroes_unmap_sectors in queue_limits should be equal to max_write_zeroes_sectors if it is set to a non-zero value. However, when the backend bdev is specified, this parameter is initialized to UINT_MAX during the call to blk_set_stacking_limits(), while only max_write_zeroes_sectors is adjusted. Therefore, this discrepancy triggers a value check failure in blk_validate_limits(). Since the drvd driver doesn't yet support unmap write zeroes, so fix this failure by explicitly setting max_hw_wzeroes_unmap_sectors to zero. Fixes: 0c40d7cb5ef3 ("block: introduce max_{hw|user}_wzeroes_unmap_sectors to queue limits") Signed-off-by: Zhang Yi Reviewed-by: Martin K. Petersen Reviewed-by: Yu Kuai Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e09930c2b226..91f3b8afb63c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1330,6 +1330,7 @@ void drbd_reconsider_queue_parameters(struct drbd_device *device, lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; else lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; if ((lim.discard_granularity >> SECTOR_SHIFT) > lim.max_hw_discard_sectors) { From 7dd670db9afdca6aed4a9f99776cb72e6ed5a1f5 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 17 Sep 2025 17:06:09 +0100 Subject: [PATCH 384/450] ALSA: hda: intel-dsp-config: Prevent SEGFAULT if ACPI_HANDLE() is NULL Check in snd_intel_dsp_check_soundwire() that the pointer returned by ACPI_HANDLE() is not NULL, before passing it on to other functions. The original code assumed a non-NULL return, but if it was unexpectedly NULL it would end up passed to acpi_walk_namespace() as the start point, and would result in [ 3.219028] BUG: kernel NULL pointer dereference, address: 0000000000000018 [ 3.219029] #PF: supervisor read access in kernel mode [ 3.219030] #PF: error_code(0x0000) - not-present page [ 3.219031] PGD 0 P4D 0 [ 3.219032] Oops: Oops: 0000 [#1] SMP NOPTI [ 3.219035] CPU: 2 UID: 0 PID: 476 Comm: (udev-worker) Tainted: G S AW E 6.17.0-rc5-test #1 PREEMPT(voluntary) [ 3.219038] Tainted: [S]=CPU_OUT_OF_SPEC, [A]=OVERRIDDEN_ACPI_TABLE, [W]=WARN, [E]=UNSIGNED_MODULE [ 3.219040] RIP: 0010:acpi_ns_walk_namespace+0xb5/0x480 This problem was triggered by a bugged DSDT that the kernel couldn't parse. But it shouldn't be possible to SEGFAULT the kernel just because of some bugs in ACPI. Fixes: 0650857570d1 ("ALSA: hda: add autodetection for SoundWire") Signed-off-by: Richard Fitzgerald Signed-off-by: Takashi Iwai --- sound/hda/core/intel-dsp-config.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/hda/core/intel-dsp-config.c b/sound/hda/core/intel-dsp-config.c index c15284742899..2a9e35cddcf7 100644 --- a/sound/hda/core/intel-dsp-config.c +++ b/sound/hda/core/intel-dsp-config.c @@ -650,6 +650,8 @@ static int snd_intel_dsp_check_soundwire(struct pci_dev *pci) int ret; handle = ACPI_HANDLE(&pci->dev); + if (!handle) + return -ENODEV; ret = sdw_intel_acpi_scan(handle, &info); if (ret < 0) From ff89a4d285c82813faa0e2e386d07120ae1f9c85 Mon Sep 17 00:00:00 2001 From: Zongyao Bai Date: Tue, 16 Sep 2025 05:47:15 +0800 Subject: [PATCH 385/450] drm/xe/sysfs: Add cleanup action in xe_device_sysfs_init On partial failure, some sysfs files created before the failure might not be removed. Add common cleanup step to remove them all immediately, as is should be harmless to attempt to remove non-existing files. Fixes: 0e414bf7ad01 ("drm/xe: Expose PCIe link downgrade attributes") Cc: Lucas De Marchi Cc: Stuart Summers Cc: Shuicheng Lin Cc: Michal Wajdeczko Signed-off-by: Zongyao Bai Reviewed-by: Shuicheng Lin Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250915214716.1327379-2-zongyao.bai@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 1a869168d91f1a1a2b0db22cea0295c67908e5d8) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device_sysfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c index bd9015761aa0..3e3b2d9033a7 100644 --- a/drivers/gpu/drm/xe/xe_device_sysfs.c +++ b/drivers/gpu/drm/xe/xe_device_sysfs.c @@ -311,12 +311,16 @@ int xe_device_sysfs_init(struct xe_device *xe) if (xe->info.platform == XE_BATTLEMAGE) { ret = sysfs_create_files(&dev->kobj, auto_link_downgrade_attrs); if (ret) - return ret; + goto cleanup; ret = late_bind_create_files(dev); if (ret) - return ret; + goto cleanup; } return devm_add_action_or_reset(dev, xe_device_sysfs_fini, xe); + +cleanup: + xe_device_sysfs_fini(xe); + return ret; } From ae5fbbda341f92e605a9508a0fb18456155517f0 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Tue, 9 Sep 2025 15:12:40 -0700 Subject: [PATCH 386/450] drm/xe: Fix error handling if PXP fails to start Since the PXP start comes after __xe_exec_queue_init() has completed, we need to cleanup what was done in that function in case of a PXP start error. __xe_exec_queue_init calls the submission backend init() function, so we need to introduce an opposite for that. Unfortunately, while we already have a fini() function pointer, it performs other operations in addition to cleaning up what was done by the init(). Therefore, for clarity, the existing fini() has been renamed to destroy(), while a new fini() has been added to only clean up what was done by the init(), with the latter being called by the former (via xe_exec_queue_fini). Fixes: 72d479601d67 ("drm/xe/pxp/uapi: Add userspace and LRC support for PXP-using queues") Signed-off-by: Daniele Ceraolo Spurio Cc: John Harrison Cc: Matthew Brost Reviewed-by: John Harrison Signed-off-by: John Harrison Link: https://lore.kernel.org/r/20250909221240.3711023-3-daniele.ceraolospurio@intel.com (cherry picked from commit 626667321deb4c7a294725406faa3dd71c3d445d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_exec_queue.c | 22 +++++--- drivers/gpu/drm/xe/xe_exec_queue_types.h | 8 ++- drivers/gpu/drm/xe/xe_execlist.c | 25 +++++---- drivers/gpu/drm/xe/xe_execlist_types.h | 2 +- drivers/gpu/drm/xe/xe_guc_exec_queue_types.h | 4 +- drivers/gpu/drm/xe/xe_guc_submit.c | 54 ++++++++++++-------- 6 files changed, 73 insertions(+), 42 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 8991b4aed440..c07edcda99c5 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -151,6 +151,16 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q) return err; } +static void __xe_exec_queue_fini(struct xe_exec_queue *q) +{ + int i; + + q->ops->fini(q); + + for (i = 0; i < q->width; ++i) + xe_lrc_put(q->lrc[i]); +} + struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm, u32 logical_mask, u16 width, struct xe_hw_engine *hwe, u32 flags, @@ -181,11 +191,13 @@ struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *v if (xe_exec_queue_uses_pxp(q)) { err = xe_pxp_exec_queue_add(xe->pxp, q); if (err) - goto err_post_alloc; + goto err_post_init; } return q; +err_post_init: + __xe_exec_queue_fini(q); err_post_alloc: __xe_exec_queue_free(q); return ERR_PTR(err); @@ -283,13 +295,11 @@ void xe_exec_queue_destroy(struct kref *ref) xe_exec_queue_put(eq); } - q->ops->fini(q); + q->ops->destroy(q); } void xe_exec_queue_fini(struct xe_exec_queue *q) { - int i; - /* * Before releasing our ref to lrc and xef, accumulate our run ticks * and wakeup any waiters. @@ -298,9 +308,7 @@ void xe_exec_queue_fini(struct xe_exec_queue *q) if (q->xef && atomic_dec_and_test(&q->xef->exec_queue.pending_removal)) wake_up_var(&q->xef->exec_queue.pending_removal); - for (i = 0; i < q->width; ++i) - xe_lrc_put(q->lrc[i]); - + __xe_exec_queue_fini(q); __xe_exec_queue_free(q); } diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index cc1cffb5c87f..1c9d03f2a3e5 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -166,8 +166,14 @@ struct xe_exec_queue_ops { int (*init)(struct xe_exec_queue *q); /** @kill: Kill inflight submissions for backend */ void (*kill)(struct xe_exec_queue *q); - /** @fini: Fini exec queue for submission backend */ + /** @fini: Undoes the init() for submission backend */ void (*fini)(struct xe_exec_queue *q); + /** + * @destroy: Destroy exec queue for submission backend. The backend + * function must call xe_exec_queue_fini() (which will in turn call the + * fini() backend function) to ensure the queue is properly cleaned up. + */ + void (*destroy)(struct xe_exec_queue *q); /** @set_priority: Set priority for exec queue */ int (*set_priority)(struct xe_exec_queue *q, enum xe_exec_queue_priority priority); diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c index 788f56b066b6..f83d421ac9d3 100644 --- a/drivers/gpu/drm/xe/xe_execlist.c +++ b/drivers/gpu/drm/xe/xe_execlist.c @@ -385,10 +385,20 @@ static int execlist_exec_queue_init(struct xe_exec_queue *q) return err; } -static void execlist_exec_queue_fini_async(struct work_struct *w) +static void execlist_exec_queue_fini(struct xe_exec_queue *q) +{ + struct xe_execlist_exec_queue *exl = q->execlist; + + drm_sched_entity_fini(&exl->entity); + drm_sched_fini(&exl->sched); + + kfree(exl); +} + +static void execlist_exec_queue_destroy_async(struct work_struct *w) { struct xe_execlist_exec_queue *ee = - container_of(w, struct xe_execlist_exec_queue, fini_async); + container_of(w, struct xe_execlist_exec_queue, destroy_async); struct xe_exec_queue *q = ee->q; struct xe_execlist_exec_queue *exl = q->execlist; struct xe_device *xe = gt_to_xe(q->gt); @@ -401,10 +411,6 @@ static void execlist_exec_queue_fini_async(struct work_struct *w) list_del(&exl->active_link); spin_unlock_irqrestore(&exl->port->lock, flags); - drm_sched_entity_fini(&exl->entity); - drm_sched_fini(&exl->sched); - kfree(exl); - xe_exec_queue_fini(q); } @@ -413,10 +419,10 @@ static void execlist_exec_queue_kill(struct xe_exec_queue *q) /* NIY */ } -static void execlist_exec_queue_fini(struct xe_exec_queue *q) +static void execlist_exec_queue_destroy(struct xe_exec_queue *q) { - INIT_WORK(&q->execlist->fini_async, execlist_exec_queue_fini_async); - queue_work(system_unbound_wq, &q->execlist->fini_async); + INIT_WORK(&q->execlist->destroy_async, execlist_exec_queue_destroy_async); + queue_work(system_unbound_wq, &q->execlist->destroy_async); } static int execlist_exec_queue_set_priority(struct xe_exec_queue *q, @@ -467,6 +473,7 @@ static const struct xe_exec_queue_ops execlist_exec_queue_ops = { .init = execlist_exec_queue_init, .kill = execlist_exec_queue_kill, .fini = execlist_exec_queue_fini, + .destroy = execlist_exec_queue_destroy, .set_priority = execlist_exec_queue_set_priority, .set_timeslice = execlist_exec_queue_set_timeslice, .set_preempt_timeout = execlist_exec_queue_set_preempt_timeout, diff --git a/drivers/gpu/drm/xe/xe_execlist_types.h b/drivers/gpu/drm/xe/xe_execlist_types.h index 415140936f11..92c4ba52db0c 100644 --- a/drivers/gpu/drm/xe/xe_execlist_types.h +++ b/drivers/gpu/drm/xe/xe_execlist_types.h @@ -42,7 +42,7 @@ struct xe_execlist_exec_queue { bool has_run; - struct work_struct fini_async; + struct work_struct destroy_async; enum xe_exec_queue_priority active_priority; struct list_head active_link; diff --git a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h index a3f421e2adc0..c30c0e3ccbbb 100644 --- a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h @@ -35,8 +35,8 @@ struct xe_guc_exec_queue { struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE]; /** @lr_tdr: long running TDR worker */ struct work_struct lr_tdr; - /** @fini_async: do final fini async from this worker */ - struct work_struct fini_async; + /** @destroy_async: do final destroy async from this worker */ + struct work_struct destroy_async; /** @resume_time: time of last resume */ u64 resume_time; /** @state: GuC specific state for this xe_exec_queue */ diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index cafb47711e9b..93fc7b290b65 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1277,21 +1277,12 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) return DRM_GPU_SCHED_STAT_NO_HANG; } -static void __guc_exec_queue_fini_async(struct work_struct *w) +static void guc_exec_queue_fini(struct xe_exec_queue *q) { - struct xe_guc_exec_queue *ge = - container_of(w, struct xe_guc_exec_queue, fini_async); - struct xe_exec_queue *q = ge->q; + struct xe_guc_exec_queue *ge = q->guc; struct xe_guc *guc = exec_queue_to_guc(q); - xe_pm_runtime_get(guc_to_xe(guc)); - trace_xe_exec_queue_destroy(q); - release_guc_id(guc, q); - if (xe_exec_queue_is_lr(q)) - cancel_work_sync(&ge->lr_tdr); - /* Confirm no work left behind accessing device structures */ - cancel_delayed_work_sync(&ge->sched.base.work_tdr); xe_sched_entity_fini(&ge->entity); xe_sched_fini(&ge->sched); @@ -1300,25 +1291,43 @@ static void __guc_exec_queue_fini_async(struct work_struct *w) * (timeline name). */ kfree_rcu(ge, rcu); +} + +static void __guc_exec_queue_destroy_async(struct work_struct *w) +{ + struct xe_guc_exec_queue *ge = + container_of(w, struct xe_guc_exec_queue, destroy_async); + struct xe_exec_queue *q = ge->q; + struct xe_guc *guc = exec_queue_to_guc(q); + + xe_pm_runtime_get(guc_to_xe(guc)); + trace_xe_exec_queue_destroy(q); + + if (xe_exec_queue_is_lr(q)) + cancel_work_sync(&ge->lr_tdr); + /* Confirm no work left behind accessing device structures */ + cancel_delayed_work_sync(&ge->sched.base.work_tdr); + xe_exec_queue_fini(q); + xe_pm_runtime_put(guc_to_xe(guc)); } -static void guc_exec_queue_fini_async(struct xe_exec_queue *q) +static void guc_exec_queue_destroy_async(struct xe_exec_queue *q) { struct xe_guc *guc = exec_queue_to_guc(q); struct xe_device *xe = guc_to_xe(guc); - INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async); + INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async); /* We must block on kernel engines so slabs are empty on driver unload */ if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q)) - __guc_exec_queue_fini_async(&q->guc->fini_async); + __guc_exec_queue_destroy_async(&q->guc->destroy_async); else - queue_work(xe->destroy_wq, &q->guc->fini_async); + queue_work(xe->destroy_wq, &q->guc->destroy_async); } -static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q) +static void __guc_exec_queue_destroy(struct xe_guc *guc, struct xe_exec_queue *q) { /* * Might be done from within the GPU scheduler, need to do async as we @@ -1327,7 +1336,7 @@ static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q) * this we and don't really care when everything is fini'd, just that it * is. */ - guc_exec_queue_fini_async(q); + guc_exec_queue_destroy_async(q); } static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg) @@ -1341,7 +1350,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg) if (exec_queue_registered(q)) disable_scheduling_deregister(guc, q); else - __guc_exec_queue_fini(guc, q); + __guc_exec_queue_destroy(guc, q); } static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q) @@ -1574,14 +1583,14 @@ static bool guc_exec_queue_try_add_msg(struct xe_exec_queue *q, #define STATIC_MSG_CLEANUP 0 #define STATIC_MSG_SUSPEND 1 #define STATIC_MSG_RESUME 2 -static void guc_exec_queue_fini(struct xe_exec_queue *q) +static void guc_exec_queue_destroy(struct xe_exec_queue *q) { struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP; if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q)) guc_exec_queue_add_msg(q, msg, CLEANUP); else - __guc_exec_queue_fini(exec_queue_to_guc(q), q); + __guc_exec_queue_destroy(exec_queue_to_guc(q), q); } static int guc_exec_queue_set_priority(struct xe_exec_queue *q, @@ -1711,6 +1720,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = { .init = guc_exec_queue_init, .kill = guc_exec_queue_kill, .fini = guc_exec_queue_fini, + .destroy = guc_exec_queue_destroy, .set_priority = guc_exec_queue_set_priority, .set_timeslice = guc_exec_queue_set_timeslice, .set_preempt_timeout = guc_exec_queue_set_preempt_timeout, @@ -1732,7 +1742,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q) if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q)) xe_exec_queue_put(q); else if (exec_queue_destroyed(q)) - __guc_exec_queue_fini(guc, q); + __guc_exec_queue_destroy(guc, q); } if (q->guc->suspend_pending) { set_exec_queue_suspended(q); @@ -1989,7 +1999,7 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q) if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q)) xe_exec_queue_put(q); else - __guc_exec_queue_fini(guc, q); + __guc_exec_queue_destroy(guc, q); } int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len) From a8ba87f04ca9cdec06776ce92dce1395026dc3bb Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 16 Sep 2025 08:01:26 +0000 Subject: [PATCH 387/450] bonding: don't set oif to bond dev when getting NS target destination Unlike IPv4, IPv6 routing strictly requires the source address to be valid on the outgoing interface. If the NS target is set to a remote VLAN interface, and the source address is also configured on a VLAN over a bond interface, setting the oif to the bond device will fail to retrieve the correct destination route. Fix this by not setting the oif to the bond device when retrieving the NS target destination. This allows the correct destination device (the VLAN interface) to be determined, so that bond_verify_device_path can return the proper VLAN tags for sending NS messages. Reported-by: David Wilder Closes: https://lore.kernel.org/netdev/aGOKggdfjv0cApTO@fedora/ Suggested-by: Jay Vosburgh Tested-by: David Wilder Acked-by: Jay Vosburgh Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250916080127.430626-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8832bc9f107b..57be04f6cb11 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3356,7 +3356,6 @@ static void bond_ns_send_all(struct bonding *bond, struct slave *slave) /* Find out through which dev should the packet go */ memset(&fl6, 0, sizeof(struct flowi6)); fl6.daddr = targets[i]; - fl6.flowi6_oif = bond->dev->ifindex; dst = ip6_route_output(dev_net(bond->dev), NULL, &fl6); if (dst->error) { From dc5f94b1ec8f9f65d1ad7a5f45fcac740544e35f Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 16 Sep 2025 08:01:27 +0000 Subject: [PATCH 388/450] selftests: bonding: add vlan over bond testing Add a vlan over bond testing to make sure arp/ns target works. Also change all the configs to mudules. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250916080127.430626-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- .../drivers/net/bonding/bond_options.sh | 58 +++++++++++++++++++ .../selftests/drivers/net/bonding/config | 1 + 2 files changed, 59 insertions(+) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index e3f3cc803b56..187b478d0ddf 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -8,6 +8,7 @@ ALL_TESTS=" arp_validate num_grat_arp fail_over_mac + vlan_over_bond " lib_dir=$(dirname "$0") @@ -508,7 +509,64 @@ fail_over_mac() log_test "fail_over_mac 2" "failover: backup slave mac inherit" check_first_slave_random_mac log_test "fail_over_mac 2" "first slave mac random" +} +vlan_over_bond_arp() +{ + local mode="$1" + RET=0 + + bond_reset "mode $mode arp_interval 100 arp_ip_target 192.0.3.10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 101 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond arp" "$mode" +} + +vlan_over_bond_ns() +{ + local mode="$1" + RET=0 + + if skip_ns; then + log_test_skip "vlan_over_bond ns" "$mode" + return 0 + fi + + bond_reset "mode $mode arp_interval 100 ns_ip6_target 2001:db8::3:10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 102 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond ns" "$mode" +} + +vlan_over_bond() +{ + # add vlan 3 for client + ip -n "${c_ns}" link add eth0.3 link eth0 type vlan id 3 + ip -n "${c_ns}" link set eth0.3 up + ip -n "${c_ns}" addr add 192.0.3.10/24 dev eth0.3 + ip -n "${c_ns}" addr add 2001:db8::3:10/64 dev eth0.3 + + # Add tc rule to check the vlan pkts + tc -n "${c_ns}" qdisc add dev eth0.3 clsact + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol arp \ + handle 101 flower skip_hw arp_op request \ + arp_sip 192.0.3.1 arp_tip 192.0.3.10 action pass + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol ipv6 \ + handle 102 flower skip_hw ip_proto icmpv6 \ + type 135 src_ip 2001:db8::3:1 action pass + + vlan_over_bond_arp "active-backup" + vlan_over_bond_ns "active-backup" } trap cleanup EXIT diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 4d16a69ffc65..832fa1caeb66 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -10,3 +10,4 @@ CONFIG_NET_CLS_MATCHALL=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y CONFIG_VETH=y +CONFIG_VLAN_8021Q=m From dc3382fffdec2c1d6df5836c88fa37b39cd8651e Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Tue, 16 Sep 2025 15:58:16 +0800 Subject: [PATCH 389/450] tracing: kprobe-event: Fix null-ptr-deref in trace_kprobe_create_internal() A crash was observed with the following output: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 UID: 0 PID: 2899 Comm: syz.2.399 Not tainted 6.17.0-rc5+ #5 PREEMPT(none) RIP: 0010:trace_kprobe_create_internal+0x3fc/0x1440 kernel/trace/trace_kprobe.c:911 Call Trace: trace_kprobe_create_cb+0xa2/0xf0 kernel/trace/trace_kprobe.c:1089 trace_probe_create+0xf1/0x110 kernel/trace/trace_probe.c:2246 dyn_event_create+0x45/0x70 kernel/trace/trace_dynevent.c:128 create_or_delete_trace_kprobe+0x5e/0xc0 kernel/trace/trace_kprobe.c:1107 trace_parse_run_command+0x1a5/0x330 kernel/trace/trace.c:10785 vfs_write+0x2b6/0xd00 fs/read_write.c:684 ksys_write+0x129/0x240 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x5d/0x2d0 arch/x86/entry/syscall_64.c:94 Function kmemdup() may return NULL in trace_kprobe_create_internal(), add check for it's return value. Link: https://lore.kernel.org/all/20250916075816.3181175-1-wangliang74@huawei.com/ Fixes: 33b4e38baa03 ("tracing: kprobe-event: Allocate string buffers from heap") Signed-off-by: Wang Liang Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ccae62d4fb91..fa60362a3f31 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -908,6 +908,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], return -EINVAL; } buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { From a72175c985132885573593222a7b088cf49b07ae Mon Sep 17 00:00:00 2001 From: Sathesh B Edara Date: Tue, 16 Sep 2025 06:32:07 -0700 Subject: [PATCH 390/450] octeon_ep: fix VF MAC address lifecycle handling Currently, VF MAC address info is not updated when the MAC address is configured from VF, and it is not cleared when the VF is removed. This leads to stale or missing MAC information in the PF, which may cause incorrect state tracking or inconsistencies when VFs are hot-plugged or reassigned. Fix this by: - storing the VF MAC address in the PF when it is set from VF - clearing the stored VF MAC address when the VF is removed This ensures that the PF always has correct VF MAC state. Fixes: cde29af9e68e ("octeon_ep: add PF-VF mailbox communication") Signed-off-by: Sathesh B Edara Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250916133207.21737-1-sedara@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c index ebecdd29f3bd..0867fab61b19 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c @@ -196,6 +196,7 @@ static void octep_pfvf_get_mac_addr(struct octep_device *oct, u32 vf_id, vf_id); return; } + ether_addr_copy(oct->vf_info[vf_id].mac_addr, rsp->s_set_mac.mac_addr); rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK; } @@ -205,6 +206,8 @@ static void octep_pfvf_dev_remove(struct octep_device *oct, u32 vf_id, { int err; + /* Reset VF-specific information maintained by the PF */ + memset(&oct->vf_info[vf_id], 0, sizeof(struct octep_pfvf_info)); err = octep_ctrl_net_dev_remove(oct, vf_id); if (err) { rsp->s.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK; From 45c8a6cc2bcd780e634a6ba8e46bffbdf1fc5c01 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 15 Sep 2025 17:56:46 +0000 Subject: [PATCH 391/450] tcp: Clear tcp_sk(sk)->fastopen_rsk in tcp_disconnect(). syzbot reported the splat below where a socket had tcp_sk(sk)->fastopen_rsk in the TCP_ESTABLISHED state. [0] syzbot reused the server-side TCP Fast Open socket as a new client before the TFO socket completes 3WHS: 1. accept() 2. connect(AF_UNSPEC) 3. connect() to another destination As of accept(), sk->sk_state is TCP_SYN_RECV, and tcp_disconnect() changes it to TCP_CLOSE and makes connect() possible, which restarts timers. Since tcp_disconnect() forgot to clear tcp_sk(sk)->fastopen_rsk, the retransmit timer triggered the warning and the intended packet was not retransmitted. Let's call reqsk_fastopen_remove() in tcp_disconnect(). [0]: WARNING: CPU: 2 PID: 0 at net/ipv4/tcp_timer.c:542 tcp_retransmit_timer (net/ipv4/tcp_timer.c:542 (discriminator 7)) Modules linked in: CPU: 2 UID: 0 PID: 0 Comm: swapper/2 Not tainted 6.17.0-rc5-g201825fb4278 #62 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:tcp_retransmit_timer (net/ipv4/tcp_timer.c:542 (discriminator 7)) Code: 41 55 41 54 55 53 48 8b af b8 08 00 00 48 89 fb 48 85 ed 0f 84 55 01 00 00 0f b6 47 12 3c 03 74 0c 0f b6 47 12 3c 04 74 04 90 <0f> 0b 90 48 8b 85 c0 00 00 00 48 89 ef 48 8b 40 30 e8 6a 4f 06 3e RSP: 0018:ffffc900002f8d40 EFLAGS: 00010293 RAX: 0000000000000002 RBX: ffff888106911400 RCX: 0000000000000017 RDX: 0000000002517619 RSI: ffffffff83764080 RDI: ffff888106911400 RBP: ffff888106d5c000 R08: 0000000000000001 R09: ffffc900002f8de8 R10: 00000000000000c2 R11: ffffc900002f8ff8 R12: ffff888106911540 R13: ffff888106911480 R14: ffff888106911840 R15: ffffc900002f8de0 FS: 0000000000000000(0000) GS:ffff88907b768000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8044d69d90 CR3: 0000000002c30003 CR4: 0000000000370ef0 Call Trace: tcp_write_timer (net/ipv4/tcp_timer.c:738) call_timer_fn (kernel/time/timer.c:1747) __run_timers (kernel/time/timer.c:1799 kernel/time/timer.c:2372) timer_expire_remote (kernel/time/timer.c:2385 kernel/time/timer.c:2376 kernel/time/timer.c:2135) tmigr_handle_remote_up (kernel/time/timer_migration.c:944 kernel/time/timer_migration.c:1035) __walk_groups.isra.0 (kernel/time/timer_migration.c:533 (discriminator 1)) tmigr_handle_remote (kernel/time/timer_migration.c:1096) handle_softirqs (./arch/x86/include/asm/jump_label.h:36 ./include/trace/events/irq.h:142 kernel/softirq.c:580) irq_exit_rcu (kernel/softirq.c:614 kernel/softirq.c:453 kernel/softirq.c:680 kernel/softirq.c:696) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1050 (discriminator 35) arch/x86/kernel/apic/apic.c:1050 (discriminator 35)) Fixes: 8336886f786f ("tcp: TCP Fast Open Server - support TFO listeners") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250915175800.118793-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71a956fbfc55..ad76556800f2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3327,6 +3327,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; + struct request_sock *req; u32 seq; if (old_state != TCP_CLOSE) @@ -3442,6 +3443,10 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) + reqsk_fastopen_remove(sk, req, false); tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; From 1fd0362262baff9381615a5b346298abbc165ba3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 15 Sep 2025 17:56:47 +0000 Subject: [PATCH 392/450] selftest: packetdrill: Add tcp_fastopen_server_reset-after-disconnect.pkt. The test reproduces the scenario explained in the previous patch. Without the patch, the test triggers the warning and cannot see the last retransmitted packet. # ./ksft_runner.sh tcp_fastopen_server_reset-after-disconnect.pkt TAP version 13 1..2 [ 29.229250] ------------[ cut here ]------------ [ 29.231414] WARNING: CPU: 26 PID: 0 at net/ipv4/tcp_timer.c:542 tcp_retransmit_timer+0x32/0x9f0 ... tcp_fastopen_server_reset-after-disconnect.pkt:26: error handling packet: Timed out waiting for packet not ok 1 ipv4 tcp_fastopen_server_reset-after-disconnect.pkt:26: error handling packet: Timed out waiting for packet not ok 2 ipv6 # Totals: pass:0 fail:2 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250915175800.118793-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- ...fastopen_server_reset-after-disconnect.pkt | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt diff --git a/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt new file mode 100644 index 000000000000..26794e7ddfd5 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh + ./set_sysctls.py /proc/sys/net/ipv4/tcp_fastopen=0x602 /proc/sys/net/ipv4/tcp_timestamps=0` + + 0 socket(..., SOCK_STREAM|SOCK_NONBLOCK, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:10(10) win 32792 + +0 > S. 0:0(0) ack 11 win 65535 + +// sk->sk_state is TCP_SYN_RECV + +.1 accept(3, ..., ...) = 4 + +// tcp_disconnect() sets sk->sk_state to TCP_CLOSE + +0 connect(4, AF_UNSPEC, ...) = 0 + +0 > R. 1:1(0) ack 11 win 65535 + +// connect() sets sk->sk_state to TCP_SYN_SENT + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 connect(4, ..., ...) = -1 EINPROGRESS (Operation is now in progress) + +0 > S 0:0(0) win 65535 + +// tp->fastopen_rsk must be NULL + +1 > S 0:0(0) win 65535 From 26caeae9fb482ec443753b4e3307e5122b60b850 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Fri, 5 Sep 2025 16:56:33 -0700 Subject: [PATCH 393/450] drm/xe/guc: Set RCS/CCS yield policy All recent platforms (including all the ones officially supported by the Xe driver) do not allow concurrent execution of RCS and CCS workloads from different address spaces, with the HW blocking the context switch when it detects such a scenario. The DUAL_QUEUE flag helps with this, by causing the GuC to not submit a context it knows will not be able to execute. This, however, causes a new problem: if RCS and CCS queues have pending workloads from different address spaces, the GuC needs to choose from which of the 2 queues to pick the next workload to execute. By default, the GuC prioritizes RCS submissions over CCS ones, which can lead to CCS workloads being significantly (or completely) starved of execution time. The driver can tune this by setting a dedicated scheduling policy KLV; this KLV allows the driver to specify a quantum (in ms) and a ratio (percentage value between 0 and 100), and the GuC will prioritize the CCS for that percentage of each quantum. Given that we want to guarantee enough RCS throughput to avoid missing frames, we set the yield policy to 20% of each 80ms interval. v2: updated quantum and ratio, improved comment, use xe_guc_submit_disable in gt_sanitize Fixes: d9a1ae0d17bd ("drm/xe/guc: Enable WA_DUAL_QUEUE for newer platforms") Signed-off-by: Daniele Ceraolo Spurio Cc: Matthew Brost Cc: John Harrison Cc: Vinay Belgaumkar Reviewed-by: John Harrison Tested-by: Vinay Belgaumkar Link: https://lore.kernel.org/r/20250905235632.3333247-2-daniele.ceraolospurio@intel.com (cherry picked from commit 88434448438e4302e272b2a2b810b42e05ea024b) Signed-off-by: Rodrigo Vivi [Rodrigo added #include "xe_guc_submit.h" while backporting] --- drivers/gpu/drm/xe/abi/guc_actions_abi.h | 1 + drivers/gpu/drm/xe/abi/guc_klvs_abi.h | 25 +++++++++ drivers/gpu/drm/xe/xe_gt.c | 3 +- drivers/gpu/drm/xe/xe_guc.c | 6 +-- drivers/gpu/drm/xe/xe_guc_submit.c | 66 ++++++++++++++++++++++++ drivers/gpu/drm/xe/xe_guc_submit.h | 2 + 6 files changed, 98 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h index 81eb046aeebf..b9f67d7a00d8 100644 --- a/drivers/gpu/drm/xe/abi/guc_actions_abi.h +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h @@ -117,6 +117,7 @@ enum xe_guc_action { XE_GUC_ACTION_ENTER_S_STATE = 0x501, XE_GUC_ACTION_EXIT_S_STATE = 0x502, XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE = 0x506, + XE_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV = 0x509, XE_GUC_ACTION_SCHED_CONTEXT = 0x1000, XE_GUC_ACTION_SCHED_CONTEXT_MODE_SET = 0x1001, XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE = 0x1002, diff --git a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h index 0366a9da5977..d7719d0e36ca 100644 --- a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h +++ b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h @@ -17,6 +17,7 @@ * | 0 | 31:16 | **KEY** - KLV key identifier | * | | | - `GuC Self Config KLVs`_ | * | | | - `GuC Opt In Feature KLVs`_ | + * | | | - `GuC Scheduling Policies KLVs`_ | * | | | - `GuC VGT Policy KLVs`_ | * | | | - `GuC VF Configuration KLVs`_ | * | | | | @@ -152,6 +153,30 @@ enum { #define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_KEY 0x4003 #define GUC_KLV_OPT_IN_FEATURE_DYNAMIC_INHIBIT_CONTEXT_SWITCH_LEN 0u +/** + * DOC: GuC Scheduling Policies KLVs + * + * `GuC KLV`_ keys available for use with UPDATE_SCHEDULING_POLICIES_KLV. + * + * _`GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD` : 0x1001 + * Some platforms do not allow concurrent execution of RCS and CCS + * workloads from different address spaces. By default, the GuC prioritizes + * RCS submissions over CCS ones, which can lead to CCS workloads being + * significantly (or completely) starved of execution time. This KLV allows + * the driver to specify a quantum (in ms) and a ratio (percentage value + * between 0 and 100), and the GuC will prioritize the CCS for that + * percentage of each quantum. For example, specifying 100ms and 30% will + * make the GuC prioritize the CCS for 30ms of every 100ms. + * Note that this does not necessarly mean that RCS and CCS engines will + * only be active for their percentage of the quantum, as the restriction + * only kicks in if both classes are fully busy with non-compatible address + * spaces; i.e., if one engine is idle or running the same address space, + * a pending job on the other engine will still be submitted to the HW no + * matter what the ratio is + */ +#define GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD_KEY 0x1001 +#define GUC_KLV_SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD_LEN 2u + /** * DOC: GuC VGT Policy KLVs * diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index c8eda36546d3..17634195cdc2 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -41,6 +41,7 @@ #include "xe_gt_topology.h" #include "xe_guc_exec_queue_types.h" #include "xe_guc_pc.h" +#include "xe_guc_submit.h" #include "xe_hw_fence.h" #include "xe_hw_engine_class_sysfs.h" #include "xe_irq.h" @@ -97,7 +98,7 @@ void xe_gt_sanitize(struct xe_gt *gt) * FIXME: if xe_uc_sanitize is called here, on TGL driver will not * reload */ - gt->uc.guc.submission_state.enabled = false; + xe_guc_submit_disable(>->uc.guc); } static void xe_gt_enable_host_l2_vram(struct xe_gt *gt) diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c index b1d1d6da3758..270fc3792493 100644 --- a/drivers/gpu/drm/xe/xe_guc.c +++ b/drivers/gpu/drm/xe/xe_guc.c @@ -880,9 +880,7 @@ int xe_guc_post_load_init(struct xe_guc *guc) return ret; } - guc->submission_state.enabled = true; - - return 0; + return xe_guc_submit_enable(guc); } int xe_guc_reset(struct xe_guc *guc) @@ -1579,7 +1577,7 @@ void xe_guc_sanitize(struct xe_guc *guc) { xe_uc_fw_sanitize(&guc->fw); xe_guc_ct_disable(&guc->ct); - guc->submission_state.enabled = false; + xe_guc_submit_disable(guc); } int xe_guc_reset_prepare(struct xe_guc *guc) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 93fc7b290b65..0104afbc941c 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -32,6 +32,7 @@ #include "xe_guc_ct.h" #include "xe_guc_exec_queue_types.h" #include "xe_guc_id_mgr.h" +#include "xe_guc_klv_helpers.h" #include "xe_guc_submit_types.h" #include "xe_hw_engine.h" #include "xe_hw_fence.h" @@ -316,6 +317,71 @@ int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids) return drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc); } +/* + * Given that we want to guarantee enough RCS throughput to avoid missing + * frames, we set the yield policy to 20% of each 80ms interval. + */ +#define RC_YIELD_DURATION 80 /* in ms */ +#define RC_YIELD_RATIO 20 /* in percent */ +static u32 *emit_render_compute_yield_klv(u32 *emit) +{ + *emit++ = PREP_GUC_KLV_TAG(SCHEDULING_POLICIES_RENDER_COMPUTE_YIELD); + *emit++ = RC_YIELD_DURATION; + *emit++ = RC_YIELD_RATIO; + + return emit; +} + +#define SCHEDULING_POLICY_MAX_DWORDS 16 +static int guc_init_global_schedule_policy(struct xe_guc *guc) +{ + u32 data[SCHEDULING_POLICY_MAX_DWORDS]; + u32 *emit = data; + u32 count = 0; + int ret; + + if (GUC_SUBMIT_VER(guc) < MAKE_GUC_VER(1, 1, 0)) + return 0; + + *emit++ = XE_GUC_ACTION_UPDATE_SCHEDULING_POLICIES_KLV; + + if (CCS_MASK(guc_to_gt(guc))) + emit = emit_render_compute_yield_klv(emit); + + count = emit - data; + if (count > 1) { + xe_assert(guc_to_xe(guc), count <= SCHEDULING_POLICY_MAX_DWORDS); + + ret = xe_guc_ct_send_block(&guc->ct, data, count); + if (ret < 0) { + xe_gt_err(guc_to_gt(guc), + "failed to enable GuC sheduling policies: %pe\n", + ERR_PTR(ret)); + return ret; + } + } + + return 0; +} + +int xe_guc_submit_enable(struct xe_guc *guc) +{ + int ret; + + ret = guc_init_global_schedule_policy(guc); + if (ret) + return ret; + + guc->submission_state.enabled = true; + + return 0; +} + +void xe_guc_submit_disable(struct xe_guc *guc) +{ + guc->submission_state.enabled = false; +} + static void __release_guc_id(struct xe_guc *guc, struct xe_exec_queue *q, u32 xa_count) { int i; diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h index 9b71a986c6ca..0d126b807c10 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.h +++ b/drivers/gpu/drm/xe/xe_guc_submit.h @@ -13,6 +13,8 @@ struct xe_exec_queue; struct xe_guc; int xe_guc_submit_init(struct xe_guc *guc, unsigned int num_ids); +int xe_guc_submit_enable(struct xe_guc *guc); +void xe_guc_submit_disable(struct xe_guc *guc); int xe_guc_submit_reset_prepare(struct xe_guc *guc); void xe_guc_submit_reset_wait(struct xe_guc *guc); From f57e53ea252363234f86674db475839e5b87102e Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 10 Sep 2025 11:49:05 +0200 Subject: [PATCH 394/450] smb: client: let recv_done verify data_offset, data_length and remaining_data_length This is inspired by the related server fixes. Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Reviewed-by: Namjae Jeon Fixes: f198186aa9bb ("CIFS: SMBD: Establish SMB Direct connection") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 02d6db431fd4..dafa3ed4a630 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -453,9 +453,12 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) struct smbdirect_recv_io *response = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); struct smbdirect_socket *sc = response->socket; + struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbd_connection *info = container_of(sc, struct smbd_connection, socket); - int data_length = 0; + u32 data_offset = 0; + u32 data_length = 0; + u32 remaining_data_length = 0; log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n", response, sc->recv_io.expected, wc->status, wc->opcode, @@ -487,7 +490,22 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) /* SMBD data transfer packet */ case SMBDIRECT_EXPECT_DATA_TRANSFER: data_transfer = smbdirect_recv_io_payload(response); + + if (wc->byte_len < + offsetof(struct smbdirect_data_transfer, padding)) + goto error; + + remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); + data_offset = le32_to_cpu(data_transfer->data_offset); data_length = le32_to_cpu(data_transfer->data_length); + if (wc->byte_len < data_offset || + (u64)wc->byte_len < (u64)data_offset + data_length) + goto error; + + if (remaining_data_length > sp->max_fragmented_recv_size || + data_length > sp->max_fragmented_recv_size || + (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size) + goto error; if (data_length) { if (sc->recv_io.reassembly.full_packet_received) From 93ed9a2951308db374cba4562533dde97bac70d3 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 17 Sep 2025 16:03:22 -0300 Subject: [PATCH 395/450] smb: client: fix filename matching of deferred files Fix the following case where the client would end up closing both deferred files (foo.tmp & foo) after unlink(foo) due to strstr() call in cifs_close_deferred_file_under_dentry(): fd1 = openat(AT_FDCWD, "foo", O_WRONLY|O_CREAT|O_TRUNC, 0666); fd2 = openat(AT_FDCWD, "foo.tmp", O_WRONLY|O_CREAT|O_TRUNC, 0666); close(fd1); close(fd2); unlink("foo"); Fixes: e3fc065682eb ("cifs: Deferred close performance improvements") Signed-off-by: Paulo Alcantara (Red Hat) Reviewed-by: Enzo Matsumiya Cc: Frank Sorenson Cc: David Howells Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsproto.h | 4 ++-- fs/smb/client/inode.c | 6 +++--- fs/smb/client/misc.c | 36 +++++++++++++++--------------------- 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index c34c533b2efa..e8fba98690ce 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -312,8 +312,8 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode); extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon); -extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, - const char *path); +void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, + struct dentry *dentry); extern void cifs_mark_open_handles_for_deleted_file(struct inode *inode, const char *path); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 11d442e8b3d6..1703f1285d36 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1984,7 +1984,7 @@ static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyren } netfs_wait_for_outstanding_io(inode); - cifs_close_deferred_file_under_dentry(tcon, full_path); + cifs_close_deferred_file_under_dentry(tcon, dentry); #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { @@ -2538,10 +2538,10 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, goto cifs_rename_exit; } - cifs_close_deferred_file_under_dentry(tcon, from_name); + cifs_close_deferred_file_under_dentry(tcon, source_dentry); if (d_inode(target_dentry) != NULL) { netfs_wait_for_outstanding_io(d_inode(target_dentry)); - cifs_close_deferred_file_under_dentry(tcon, to_name); + cifs_close_deferred_file_under_dentry(tcon, target_dentry); } rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index da23cc12a52c..dda6dece802a 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -832,33 +832,28 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) kfree(tmp_list); } } -void -cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) + +void cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, + struct dentry *dentry) { - struct cifsFileInfo *cfile; struct file_list *tmp_list, *tmp_next_list; - void *page; - const char *full_path; + struct cifsFileInfo *cfile; LIST_HEAD(file_head); - page = alloc_dentry_path(); spin_lock(&tcon->open_file_lock); list_for_each_entry(cfile, &tcon->openFileList, tlist) { - full_path = build_path_from_dentry(cfile->dentry, page); - if (strstr(full_path, path)) { - if (delayed_work_pending(&cfile->deferred)) { - if (cancel_delayed_work(&cfile->deferred)) { - spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - cifs_del_deferred_close(cfile); - spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + if ((cfile->dentry == dentry) && + delayed_work_pending(&cfile->deferred) && + cancel_delayed_work(&cfile->deferred)) { + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); - tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); - if (tmp_list == NULL) - break; - tmp_list->cfile = cfile; - list_add_tail(&tmp_list->list, &file_head); - } - } + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); + if (tmp_list == NULL) + break; + tmp_list->cfile = cfile; + list_add_tail(&tmp_list->list, &file_head); } } spin_unlock(&tcon->open_file_lock); @@ -868,7 +863,6 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) list_del(&tmp_list->list); kfree(tmp_list); } - free_dentry_path(page); } /* From bac28f604c7699727b2fecf14c3a54668bbe458e Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 12 Aug 2025 12:58:21 +0200 Subject: [PATCH 396/450] smb: client: use disable[_delayed]_work_sync in smbdirect.c This makes it safer during the disconnect and avoids requeueing. It's ok to call disable[delayed_]work[_sync]() more than once. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Fixes: 050b8c374019 ("smbd: Make upper layer decide when to destroy the transport") Fixes: f198186aa9bb ("CIFS: SMBD: Establish SMB Direct connection") Fixes: c7398583340a ("CIFS: SMBD: Implement RDMA memory registration") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index dafa3ed4a630..1f8de8866c06 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1353,7 +1353,7 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->ib.qp = NULL; log_rdma_event(INFO, "cancelling idle timer\n"); - cancel_delayed_work_sync(&info->idle_timer_work); + disable_delayed_work_sync(&info->idle_timer_work); /* It's not possible for upper layer to get to reassembly */ log_rdma_event(INFO, "drain the reassembly queue\n"); @@ -1726,7 +1726,7 @@ static struct smbd_connection *_smbd_get_connection( return NULL; negotiation_failed: - cancel_delayed_work_sync(&info->idle_timer_work); + disable_delayed_work_sync(&info->idle_timer_work); destroy_caches_and_workqueue(info); sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; rdma_disconnect(sc->rdma.cm_id); @@ -2085,7 +2085,7 @@ static void destroy_mr_list(struct smbd_connection *info) struct smbdirect_socket *sc = &info->socket; struct smbd_mr *mr, *tmp; - cancel_work_sync(&info->mr_recovery_work); + disable_work_sync(&info->mr_recovery_work); list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { if (mr->state == MR_INVALIDATED) ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, From d9dcbbcf9145b68aa85c40947311a6907277e097 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 12 Aug 2025 13:03:19 +0200 Subject: [PATCH 397/450] smb: client: let smbd_destroy() call disable_work_sync(&info->post_send_credits_work) In smbd_destroy() we may destroy the memory so we better wait until post_send_credits_work is no longer pending and will never be started again. I actually just hit the case using rxe: WARNING: CPU: 0 PID: 138 at drivers/infiniband/sw/rxe/rxe_verbs.c:1032 rxe_post_recv+0x1ee/0x480 [rdma_rxe] ... [ 5305.686979] [ T138] smbd_post_recv+0x445/0xc10 [cifs] [ 5305.687135] [ T138] ? srso_alias_return_thunk+0x5/0xfbef5 [ 5305.687149] [ T138] ? __kasan_check_write+0x14/0x30 [ 5305.687185] [ T138] ? __pfx_smbd_post_recv+0x10/0x10 [cifs] [ 5305.687329] [ T138] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 5305.687356] [ T138] ? srso_alias_return_thunk+0x5/0xfbef5 [ 5305.687368] [ T138] ? srso_alias_return_thunk+0x5/0xfbef5 [ 5305.687378] [ T138] ? _raw_spin_unlock_irqrestore+0x11/0x60 [ 5305.687389] [ T138] ? srso_alias_return_thunk+0x5/0xfbef5 [ 5305.687399] [ T138] ? get_receive_buffer+0x168/0x210 [cifs] [ 5305.687555] [ T138] smbd_post_send_credits+0x382/0x4b0 [cifs] [ 5305.687701] [ T138] ? __pfx_smbd_post_send_credits+0x10/0x10 [cifs] [ 5305.687855] [ T138] ? __pfx___schedule+0x10/0x10 [ 5305.687865] [ T138] ? __pfx__raw_spin_lock_irq+0x10/0x10 [ 5305.687875] [ T138] ? queue_delayed_work_on+0x8e/0xa0 [ 5305.687889] [ T138] process_one_work+0x629/0xf80 [ 5305.687908] [ T138] ? srso_alias_return_thunk+0x5/0xfbef5 [ 5305.687917] [ T138] ? __kasan_check_write+0x14/0x30 [ 5305.687933] [ T138] worker_thread+0x87f/0x1570 ... It means rxe_post_recv was called after rdma_destroy_qp(). This happened because put_receive_buffer() was triggered by ib_drain_qp() and called: queue_work(info->workqueue, &info->post_send_credits_work); Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Fixes: f198186aa9bb ("CIFS: SMBD: Establish SMB Direct connection") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 1f8de8866c06..1f3d64145863 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1347,6 +1347,9 @@ void smbd_destroy(struct TCP_Server_Info *server) sc->status == SMBDIRECT_SOCKET_DISCONNECTED); } + log_rdma_event(INFO, "cancelling post_send_credits_work\n"); + disable_work_sync(&info->post_send_credits_work); + log_rdma_event(INFO, "destroying qp\n"); ib_drain_qp(sc->ib.qp); rdma_destroy_qp(sc->rdma.cm_id); From 96fa515e70f3e4b98685ef8cac9d737fc62f10e1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 16 Sep 2025 07:54:06 +0930 Subject: [PATCH 398/450] btrfs: tree-checker: fix the incorrect inode ref size check [BUG] Inside check_inode_ref(), we need to make sure every structure, including the btrfs_inode_extref header, is covered by the item. But our code is incorrectly using "sizeof(iref)", where @iref is just a pointer. This means "sizeof(iref)" will always be "sizeof(void *)", which is much smaller than "sizeof(struct btrfs_inode_extref)". This will allow some bad inode extrefs to sneak in, defeating tree-checker. [FIX] Fix the typo by calling "sizeof(*iref)", which is the same as "sizeof(struct btrfs_inode_extref)", and will be the correct behavior we want. Fixes: 71bf92a9b877 ("btrfs: tree-checker: Add check for INODE_REF") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Johannes Thumshirn Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 0f556f4de3f9..a997c7cc35a2 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1756,10 +1756,10 @@ static int check_inode_ref(struct extent_buffer *leaf, while (ptr < end) { u16 namelen; - if (unlikely(ptr + sizeof(iref) > end)) { + if (unlikely(ptr + sizeof(*iref) > end)) { inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", - ptr, end, sizeof(iref)); + ptr, end, sizeof(*iref)); return -EUCLEAN; } From ed4e6b5d644c4dd2bc2872ffec036b7da0ec2e27 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 15 Sep 2025 08:37:47 +0200 Subject: [PATCH 399/450] btrfs: ref-verify: handle damaged extent root tree Syzbot hits a problem with enabled ref-verify, ignorebadroots and a fuzzed/damaged extent tree. There's no fallback option like in other places that can deal with it so disable the whole ref-verify as it is just a debugging feature. Reported-by: syzbot+9c3e0cdfbfe351b0bc0e@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000001b6052062139be1c@google.com/ Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 3871c3a6c743..9f1858b42c0e 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -980,11 +980,18 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; + extent_root = btrfs_extent_root(fs_info, 0); + /* If the extent tree is damaged we cannot ignore it (IGNOREBADROOTS). */ + if (IS_ERR(extent_root)) { + btrfs_warn(fs_info, "ref-verify: extent tree not available, disabling"); + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + return 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; - extent_root = btrfs_extent_root(fs_info, 0); eb = btrfs_read_lock_root_node(extent_root); level = btrfs_header_level(eb); path->nodes[level] = eb; From 9574b2330dbd2b5459b74d3b5e9619d39299fc6f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 16 Sep 2025 15:42:41 +0800 Subject: [PATCH 400/450] crypto: af_alg - Set merge to zero early in af_alg_sendmsg If an error causes af_alg_sendmsg to abort, ctx->merge may contain a garbage value from the previous loop. This may then trigger a crash on the next entry into af_alg_sendmsg when it attempts to do a merge that can't be done. Fix this by setting ctx->merge to zero near the start of the loop. Fixes: 8ff590903d5 ("crypto: algif_skcipher - User-space interface for skcipher operations") Reported-by: Muhammad Alifa Ramdhan Reported-by: Bing-Jhong Billy Jheng Signed-off-by: Herbert Xu --- crypto/af_alg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 0da7c1ac778a..407f2c238f2c 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1019,6 +1019,8 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, continue; } + ctx->merge = 0; + if (!af_alg_writable(sk)) { err = af_alg_wait_for_wmem(sk, msg->msg_flags); if (err) @@ -1058,7 +1060,6 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, ctx->used += plen; copied += plen; size -= plen; - ctx->merge = 0; } else { do { struct page *pg; From 1b34cbbf4f011a121ef7b2d7d6e6920a036d5285 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 16 Sep 2025 17:20:59 +0800 Subject: [PATCH 401/450] crypto: af_alg - Disallow concurrent writes in af_alg_sendmsg Issuing two writes to the same af_alg socket is bogus as the data will be interleaved in an unpredictable fashion. Furthermore, concurrent writes may create inconsistencies in the internal socket state. Disallow this by adding a new ctx->write field that indiciates exclusive ownership for writing. Fixes: 8ff590903d5 ("crypto: algif_skcipher - User-space interface for skcipher operations") Reported-by: Muhammad Alifa Ramdhan Reported-by: Bing-Jhong Billy Jheng Signed-off-by: Herbert Xu --- crypto/af_alg.c | 7 +++++++ include/crypto/if_alg.h | 10 ++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 407f2c238f2c..ca6fdcc6c54a 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -970,6 +970,12 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, } lock_sock(sk); + if (ctx->write) { + release_sock(sk); + return -EBUSY; + } + ctx->write = true; + if (ctx->init && !ctx->more) { if (ctx->used) { err = -EINVAL; @@ -1105,6 +1111,7 @@ int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, unlock: af_alg_data_wakeup(sk); + ctx->write = false; release_sock(sk); return copied ?: err; diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index f7b3b93f3a49..0c70f3a55575 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -135,6 +135,7 @@ struct af_alg_async_req { * SG? * @enc: Cryptographic operation to be performed when * recvmsg is invoked. + * @write: True if we are in the middle of a write. * @init: True if metadata has been sent. * @len: Length of memory allocated for this data structure. * @inflight: Non-zero when AIO requests are in flight. @@ -151,10 +152,11 @@ struct af_alg_ctx { size_t used; atomic_t rcvused; - bool more; - bool merge; - bool enc; - bool init; + u32 more:1, + merge:1, + enc:1, + write:1, + init:1; unsigned int len; From 0aeb54ac4cd5cf8f60131b4d9ec0b6dc9c27b20d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:28:13 -0700 Subject: [PATCH 402/450] tls: make sure to abort the stream if headers are bogus Normally we wait for the socket to buffer up the whole record before we service it. If the socket has a tiny buffer, however, we read out the data sooner, to prevent connection stalls. Make sure that we abort the connection when we find out late that the record is actually invalid. Retrying the parsing is fine in itself but since we copy some more data each time before we parse we can overflow the allocated skb space. Constructing a scenario in which we're under pressure without enough data in the socket to parse the length upfront is quite hard. syzbot figured out a way to do this by serving us the header in small OOB sends, and then filling in the recvbuf with a large normal send. Make sure that tls_rx_msg_size() aborts strp, if we reach an invalid record there's really no way to recover. Reported-by: Lee Jones Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Reviewed-by: Sabrina Dubroca Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250917002814.1743558-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/tls/tls.h | 1 + net/tls/tls_strp.c | 14 +++++++++----- net/tls/tls_sw.c | 3 +-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/net/tls/tls.h b/net/tls/tls.h index 4e077068e6d9..e4c42731ce39 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -141,6 +141,7 @@ void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); void tls_err_abort(struct sock *sk, int err); +void tls_strp_abort_strp(struct tls_strparser *strp, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index d71643b494a1..98e12f0ff57e 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -13,7 +13,7 @@ static struct workqueue_struct *tls_strp_wq; -static void tls_strp_abort_strp(struct tls_strparser *strp, int err) +void tls_strp_abort_strp(struct tls_strparser *strp, int err) { if (strp->stopped) return; @@ -211,11 +211,17 @@ static int tls_strp_copyin_frag(struct tls_strparser *strp, struct sk_buff *skb, struct sk_buff *in_skb, unsigned int offset, size_t in_len) { + unsigned int nfrag = skb->len / PAGE_SIZE; size_t len, chunk; skb_frag_t *frag; int sz; - frag = &skb_shinfo(skb)->frags[skb->len / PAGE_SIZE]; + if (unlikely(nfrag >= skb_shinfo(skb)->nr_frags)) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EMSGSIZE; + } + + frag = &skb_shinfo(skb)->frags[nfrag]; len = in_len; /* First make sure we got the header */ @@ -520,10 +526,8 @@ static int tls_strp_read_sock(struct tls_strparser *strp) tls_strp_load_anchor_with_queue(strp, inq); if (!strp->stm.full_len) { sz = tls_rx_msg_size(strp, strp->anchor); - if (sz < 0) { - tls_strp_abort_strp(strp, sz); + if (sz < 0) return sz; - } strp->stm.full_len = sz; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bac65d0d4e3e..daac9fd4be7e 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2474,8 +2474,7 @@ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb) return data_len + TLS_HEADER_SIZE; read_failure: - tls_err_abort(strp->sk, ret); - + tls_strp_abort_strp(strp, ret); return ret; } From 4c05c7ed880fb58790731fb53571af67b7632d87 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:28:14 -0700 Subject: [PATCH 403/450] selftests: tls: test skb copy under mem pressure and OOB Add a test which triggers mem pressure via OOB writes. Reviewed-by: Sabrina Dubroca Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250917002814.1743558-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tls.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 0f5640d8dc7f..dd093f9df6f1 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -2770,6 +2770,22 @@ TEST_F(tls_err, poll_partial_rec_async) } } +/* Use OOB+large send to trigger copy mode due to memory pressure. + * OOB causes a short read. + */ +TEST_F(tls_err, oob_pressure) +{ + char buf[1<<16]; + int i; + + memrnd(buf, sizeof(buf)); + + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); + EXPECT_EQ(send(self->fd2, buf, sizeof(buf), 0), sizeof(buf)); + for (i = 0; i < 64; i++) + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); +} + TEST(non_established) { struct tls12_crypto_info_aes_gcm_256 tls12; struct sockaddr_in addr; From b98b208300573f4ab29507f81194a6030b208444 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 25 Aug 2025 19:56:26 +0930 Subject: [PATCH 404/450] btrfs: reject invalid compression level Inspired by recent changes to compression level parsing in 6db1df415d73fc ("btrfs: accept and ignore compression level for lzo") it turns out that we do not do any extra validation for compression level input string, thus allowing things like "compress=lzo:invalid" to be accepted without warnings. Although we accept levels that are beyond the supported algorithm ranges, accepting completely invalid level specification is not correct. Fix the too loose checks for compression level, by doing proper error handling of kstrtoint(), so that we will reject not only too large values (beyond int range) but also completely wrong levels like "lzo:invalid". Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 22 +++++++++++++--------- fs/btrfs/compression.h | 2 +- fs/btrfs/super.c | 27 +++++++++++++++++++-------- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d09d622016ef..35e3071cec06 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1616,25 +1616,29 @@ int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end) } /* - * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level. Negative level - * numbers are allowed. + * Convert the compression suffix (eg. after "zlib" starting with ":") to level. + * + * If the resulting level exceeds the algo's supported levels, it will be clamped. + * + * Return <0 if no valid string can be found. + * Return 0 if everything is fine. */ -int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret) { int level = 0; int ret; - if (!type) + if (!type) { + *level_ret = btrfs_compress_set_level(type, level); return 0; + } if (str[0] == ':') { ret = kstrtoint(str + 1, 10, &level); if (ret) - level = 0; + return ret; } - level = btrfs_compress_set_level(type, level); - - return level; + *level_ret = btrfs_compress_set_level(type, level); + return 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1b38e707bbd9..7b41b2b5ff44 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -102,7 +102,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); struct folio *btrfs_alloc_compr_folio(void); void btrfs_free_compr_folio(struct folio *folio); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e708faf1892f..a3f9f9e1c02a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -276,6 +276,7 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, const struct fs_parameter *param, int opt) { const char *string = param->string; + int ret; /* * Provide the same semantics as older kernels that don't use fs @@ -294,15 +295,19 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zlib", true)) { ctx->compress_type = BTRFS_COMPRESS_ZLIB; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, - string + 4); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "lzo", true)) { ctx->compress_type = BTRFS_COMPRESS_LZO; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, - string + 3); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_LZO, string + 3, + &ctx->compress_level); + if (ret < 0) + goto error; if (string[3] == ':' && string[4]) btrfs_warn(NULL, "Compression level ignored for LZO"); btrfs_set_opt(ctx->mount_opt, COMPRESS); @@ -310,8 +315,10 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, NODATASUM); } else if (btrfs_match_compress_type(string, "zstd", true)) { ctx->compress_type = BTRFS_COMPRESS_ZSTD; - ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, - string + 4); + ret = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, string + 4, + &ctx->compress_level); + if (ret < 0) + goto error; btrfs_set_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, NODATACOW); btrfs_clear_opt(ctx->mount_opt, NODATASUM); @@ -322,10 +329,14 @@ static int btrfs_parse_compress(struct btrfs_fs_context *ctx, btrfs_clear_opt(ctx->mount_opt, COMPRESS); btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); } else { - btrfs_err(NULL, "unrecognized compression value %s", string); - return -EINVAL; + ret = -EINVAL; + goto error; } return 0; +error: + btrfs_err(NULL, "failed to parse compression option '%s'", string); + return ret; + } static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) From baad7830ee9a56756b3857348452fe756cb0a702 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 18 Sep 2025 19:43:36 +0800 Subject: [PATCH 405/450] objtool/LoongArch: Mark types based on break immediate code If the break immediate code is 0, it should mark the type as INSN_TRAP. If the break immediate code is 1, it should mark the type as INSN_BUG. While at it, format the code style and add the code comment for nop. Cc: stable@vger.kernel.org Suggested-by: WANG Rui Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/objtool/arch/loongarch/decode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index b6fdc68053cc..428ba6de3821 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -310,10 +310,16 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec if (decode_insn_reg2i16_fomat(inst, insn)) return 0; - if (inst.word == 0) + if (inst.word == 0) { + /* andi $zero, $zero, 0x0 */ insn->type = INSN_NOP; - else if (inst.reg0i15_format.opcode == break_op) { - /* break */ + } else if (inst.reg0i15_format.opcode == break_op && + inst.reg0i15_format.immediate == 0x0) { + /* break 0x0 */ + insn->type = INSN_TRAP; + } else if (inst.reg0i15_format.opcode == break_op && + inst.reg0i15_format.immediate == 0x1) { + /* break 0x1 */ insn->type = INSN_BUG; } else if (inst.reg2_format.opcode == ertn_op) { /* ertn */ From 539d7344d4feaea37e05863e9aa86bd31f28e46f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 18 Sep 2025 19:43:36 +0800 Subject: [PATCH 406/450] objtool/LoongArch: Mark special atomic instruction as INSN_BUG type When compiling with LLVM and CONFIG_RUST is set, there exists the following objtool warning: rust/compiler_builtins.o: warning: objtool: __rust__unordsf2(): unexpected end of section .text.unlikely. objdump shows that the end of section .text.unlikely is an atomic instruction: amswap.w $zero, $ra, $zero According to the LoongArch Reference Manual, if the amswap.w atomic memory access instruction has the same register number as rd and rj, the execution will trigger an Instruction Non-defined Exception, so mark the above instruction as INSN_BUG type to fix the warning. Cc: stable@vger.kernel.org Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/arch/loongarch/include/asm/inst.h | 12 ++++++++++++ tools/objtool/arch/loongarch/decode.c | 21 +++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/tools/arch/loongarch/include/asm/inst.h b/tools/arch/loongarch/include/asm/inst.h index c25b5853181d..d68fad63c8b7 100644 --- a/tools/arch/loongarch/include/asm/inst.h +++ b/tools/arch/loongarch/include/asm/inst.h @@ -51,6 +51,10 @@ enum reg2i16_op { bgeu_op = 0x1b, }; +enum reg3_op { + amswapw_op = 0x70c0, +}; + struct reg0i15_format { unsigned int immediate : 15; unsigned int opcode : 17; @@ -96,6 +100,13 @@ struct reg2i16_format { unsigned int opcode : 6; }; +struct reg3_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int rk : 5; + unsigned int opcode : 17; +}; + union loongarch_instruction { unsigned int word; struct reg0i15_format reg0i15_format; @@ -105,6 +116,7 @@ union loongarch_instruction { struct reg2i12_format reg2i12_format; struct reg2i14_format reg2i14_format; struct reg2i16_format reg2i16_format; + struct reg3_format reg3_format; }; #define LOONGARCH_INSN_SIZE sizeof(union loongarch_instruction) diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index 428ba6de3821..2e555c4060c5 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -278,6 +278,25 @@ static bool decode_insn_reg2i16_fomat(union loongarch_instruction inst, return true; } +static bool decode_insn_reg3_fomat(union loongarch_instruction inst, + struct instruction *insn) +{ + switch (inst.reg3_format.opcode) { + case amswapw_op: + if (inst.reg3_format.rd == LOONGARCH_GPR_ZERO && + inst.reg3_format.rk == LOONGARCH_GPR_RA && + inst.reg3_format.rj == LOONGARCH_GPR_ZERO) { + /* amswap.w $zero, $ra, $zero */ + insn->type = INSN_BUG; + } + break; + default: + return false; + } + + return true; +} + int arch_decode_instruction(struct objtool_file *file, const struct section *sec, unsigned long offset, unsigned int maxlen, struct instruction *insn) @@ -309,6 +328,8 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec return 0; if (decode_insn_reg2i16_fomat(inst, insn)) return 0; + if (decode_insn_reg3_fomat(inst, insn)) + return 0; if (inst.word == 0) { /* andi $zero, $zero, 0x0 */ From b15212824a01cb0b62f7b522f4ee334622cf982a Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 18 Sep 2025 19:43:42 +0800 Subject: [PATCH 407/450] LoongArch: Make LTO case independent in Makefile LTO is not only used for Clang, but maybe also used for Rust, make LTO case out of CONFIG_CC_HAS_ANNOTATE_TABLEJUMP in Makefile. This is preparation for later patch, no function changes. Cc: stable@vger.kernel.org Suggested-by: WANG Rui Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index a3a9759414f4..9d80af7f75c8 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -102,16 +102,16 @@ KBUILD_CFLAGS += $(call cc-option,-mthin-add-sub) $(call cc-option,-Wa$(comma) ifdef CONFIG_OBJTOOL ifdef CONFIG_CC_HAS_ANNOTATE_TABLEJUMP +KBUILD_CFLAGS += -mannotate-tablejump +else +KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers +endif +ifdef CONFIG_LTO_CLANG # The annotate-tablejump option can not be passed to LLVM backend when LTO is enabled. # Ensure it is aware of linker with LTO, '--loongarch-annotate-tablejump' also needs to # be passed via '-mllvm' to ld.lld. -KBUILD_CFLAGS += -mannotate-tablejump -ifdef CONFIG_LTO_CLANG KBUILD_LDFLAGS += -mllvm --loongarch-annotate-tablejump endif -else -KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers -endif endif KBUILD_RUSTFLAGS += --target=loongarch64-unknown-none-softfloat -Ccode-model=small From 74f8295c6fb8436bec9995baf6ba463151b6fb68 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 18 Sep 2025 19:43:42 +0800 Subject: [PATCH 408/450] LoongArch: Handle jump tables options for RUST When compiling with LLVM and CONFIG_RUST is set, there exist objtool warnings in rust/core.o and rust/kernel.o, like this: rust/core.o: warning: objtool: _RNvXs1_NtNtCs5QSdWC790r4_4core5ascii10ascii_charNtB5_9AsciiCharNtNtB9_3fmt5Debug3fmt+0x54: sibling call from callable instruction with modified stack frame For this special case, the related object file shows that there is no generated relocation section '.rela.discard.tablejump_annotate' for the table jump instruction jirl, thus objtool can not know that what is the actual destination address. If rustc has the option "-Cllvm-args=--loongarch-annotate-tablejump", pass the option to enable jump tables for objtool, otherwise it should pass "-Zno-jump-tables" to keep compatibility with older rustc. How to test: $ rustup component add rust-src $ make LLVM=1 rustavailable $ make ARCH=loongarch LLVM=1 clean defconfig $ scripts/config -d MODVERSIONS \ -e RUST -e SAMPLES -e SAMPLES_RUST \ -e SAMPLE_RUST_CONFIGFS -e SAMPLE_RUST_MINIMAL \ -e SAMPLE_RUST_MISC_DEVICE -e SAMPLE_RUST_PRINT \ -e SAMPLE_RUST_DMA -e SAMPLE_RUST_DRIVER_PCI \ -e SAMPLE_RUST_DRIVER_PLATFORM -e SAMPLE_RUST_DRIVER_FAUX \ -e SAMPLE_RUST_DRIVER_AUXILIARY -e SAMPLE_RUST_HOSTPROGS $ make ARCH=loongarch LLVM=1 olddefconfig all Cc: stable@vger.kernel.org Acked-by: Miguel Ojeda Reported-by: Miguel Ojeda Closes: https://lore.kernel.org/rust-for-linux/CANiq72mNeCuPkCDrG2db3w=AX+O-zYrfprisDPmRac_qh65Dmg@mail.gmail.com/ Suggested-by: WANG Rui Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 4 ++++ arch/loongarch/Makefile | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f0abc38c40ac..57933a717e92 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -298,6 +298,10 @@ config AS_HAS_LVZ_EXTENSION config CC_HAS_ANNOTATE_TABLEJUMP def_bool $(cc-option,-mannotate-tablejump) +config RUSTC_HAS_ANNOTATE_TABLEJUMP + depends on RUST + def_bool $(rustc-option,-Cllvm-args=--loongarch-annotate-tablejump) + menu "Kernel type and options" source "kernel/Kconfig.hz" diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 9d80af7f75c8..ae419e32f22e 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -106,6 +106,11 @@ KBUILD_CFLAGS += -mannotate-tablejump else KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers endif +ifdef CONFIG_RUSTC_HAS_ANNOTATE_TABLEJUMP +KBUILD_RUSTFLAGS += -Cllvm-args=--loongarch-annotate-tablejump +else +KBUILD_RUSTFLAGS += -Zno-jump-tables # keep compatibility with older compilers +endif ifdef CONFIG_LTO_CLANG # The annotate-tablejump option can not be passed to LLVM backend when LTO is enabled. # Ensure it is aware of linker with LTO, '--loongarch-annotate-tablejump' also needs to From f5003098e2f337d8e8a87dc636250e3fa978d9ad Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 18 Sep 2025 19:43:42 +0800 Subject: [PATCH 409/450] LoongArch: Update help info of ARCH_STRICT_ALIGN Loongson-3A6000 and 3C6000 CPUs also support unaligned memory access, so the current description is out of date to some extent. Actually, all of Loongson-3 series processors based on LoongArch support unaligned memory access, this hardware capability is indicated by the bit 20 (UAL) of CPUCFG1 register, update the help info to reflect the reality. Cc: stable@vger.kernel.org Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 57933a717e92..0631a6b11281 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -567,10 +567,14 @@ config ARCH_STRICT_ALIGN -mstrict-align build parameter to prevent unaligned accesses. CPUs with h/w unaligned access support: - Loongson-2K2000/2K3000/3A5000/3C5000/3D5000. + Loongson-2K2000/2K3000 and all of Loongson-3 series processors + based on LoongArch. CPUs without h/w unaligned access support: - Loongson-2K500/2K1000. + Loongson-2K0300/2K0500/2K1000. + + If you want to make sure whether to support unaligned memory access + on your hardware, please read the bit 20 (UAL) of CPUCFG1 register. This option is enabled by default to make the kernel be able to run on all LoongArch systems. But you can disable it manually if you want From a9d13433fe17be0e867e51e71a1acd2731fbef8d Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 18 Sep 2025 19:44:01 +0800 Subject: [PATCH 410/450] LoongArch: Align ACPI structures if ARCH_STRICT_ALIGN enabled ARCH_STRICT_ALIGN is used for hardware without UAL, now it only control the -mstrict-align flag. However, ACPI structures are packed by default so will cause unaligned accesses. To avoid this, define ACPI_MISALIGNMENT_NOT_SUPPORTED in asm/acenv.h to align ACPI structures if ARCH_STRICT_ALIGN enabled. Cc: stable@vger.kernel.org Reported-by: Binbin Zhou Suggested-by: Xi Ruoyao Suggested-by: Jiaxun Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/acenv.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/acenv.h b/arch/loongarch/include/asm/acenv.h index 52f298f7293b..483c955f2ae5 100644 --- a/arch/loongarch/include/asm/acenv.h +++ b/arch/loongarch/include/asm/acenv.h @@ -10,9 +10,8 @@ #ifndef _ASM_LOONGARCH_ACENV_H #define _ASM_LOONGARCH_ACENV_H -/* - * This header is required by ACPI core, but we have nothing to fill in - * right now. Will be updated later when needed. - */ +#ifdef CONFIG_ARCH_STRICT_ALIGN +#define ACPI_MISALIGNMENT_NOT_SUPPORTED +#endif /* CONFIG_ARCH_STRICT_ALIGN */ #endif /* _ASM_LOONGARCH_ACENV_H */ From 51adb03e6b865c0c6790f29659ff52d56742de2e Mon Sep 17 00:00:00 2001 From: Tao Cui Date: Thu, 18 Sep 2025 19:44:04 +0800 Subject: [PATCH 411/450] LoongArch: Check the return value when creating kobj Add a check for the return value of kobject_create_and_add(), to ensure that the kobj allocation succeeds for later use. Cc: stable@vger.kernel.org Signed-off-by: Tao Cui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/env.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/loongarch/kernel/env.c b/arch/loongarch/kernel/env.c index c0a5dc9aeae2..be309a71f204 100644 --- a/arch/loongarch/kernel/env.c +++ b/arch/loongarch/kernel/env.c @@ -109,6 +109,8 @@ static int __init boardinfo_init(void) struct kobject *loongson_kobj; loongson_kobj = kobject_create_and_add("loongson", firmware_kobj); + if (!loongson_kobj) + return -ENOMEM; return sysfs_create_file(loongson_kobj, &boardinfo_attr.attr); } From d6d69f0edde63b553345d4efaceb7daed89fe04c Mon Sep 17 00:00:00 2001 From: Tao Cui Date: Thu, 18 Sep 2025 19:44:04 +0800 Subject: [PATCH 412/450] LoongArch: Replace sprintf() with sysfs_emit() As Documentation/filesystems/sysfs.rst suggested, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. No functional change intended. Cc: stable@vger.kernel.org Signed-off-by: Tao Cui Signed-off-by: Huacai Chen --- arch/loongarch/kernel/env.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/env.c b/arch/loongarch/kernel/env.c index be309a71f204..23bd5ae2212c 100644 --- a/arch/loongarch/kernel/env.c +++ b/arch/loongarch/kernel/env.c @@ -86,7 +86,7 @@ late_initcall(fdt_cpu_clk_init); static ssize_t boardinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, + return sysfs_emit(buf, "BIOS Information\n" "Vendor\t\t\t: %s\n" "Version\t\t\t: %s\n" From 677d4a52d4dc4a147d5e84af9ff207832578be70 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 18 Sep 2025 19:44:08 +0800 Subject: [PATCH 413/450] LoongArch: Fix unreliable stack for live patching When testing the kernel live patching with "modprobe livepatch-sample", there is a timeout over 15 seconds from "starting patching transition" to "patching complete". The dmesg command shows "unreliable stack" for user tasks in debug mode, here is one of the messages: livepatch: klp_try_switch_task: bash:1193 has an unreliable stack The "unreliable stack" is because it can not unwind from do_syscall() to its previous frame handle_syscall(). It should use fp to find the original stack top due to secondary stack in do_syscall(), but fp is not used for some other functions, then fp can not be restored by the next frame of do_syscall(), so it is necessary to save fp if task is not current, in order to get the stack top of do_syscall(). Here are the call chains: klp_enable_patch() klp_try_complete_transition() klp_try_switch_task() klp_check_and_switch_task() klp_check_stack() stack_trace_save_tsk_reliable() arch_stack_walk_reliable() When executing "rmmod livepatch-sample", there exists a similar issue. With this patch, it takes a short time for patching and unpatching. Before: # modprobe livepatch-sample # dmesg -T | tail -3 [Sat Sep 6 11:00:20 2025] livepatch: 'livepatch_sample': starting patching transition [Sat Sep 6 11:00:35 2025] livepatch: signaling remaining tasks [Sat Sep 6 11:00:36 2025] livepatch: 'livepatch_sample': patching complete # echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled # rmmod livepatch_sample rmmod: ERROR: Module livepatch_sample is in use # rmmod livepatch_sample # dmesg -T | tail -3 [Sat Sep 6 11:06:05 2025] livepatch: 'livepatch_sample': starting unpatching transition [Sat Sep 6 11:06:20 2025] livepatch: signaling remaining tasks [Sat Sep 6 11:06:21 2025] livepatch: 'livepatch_sample': unpatching complete After: # modprobe livepatch-sample # dmesg -T | tail -2 [Tue Sep 16 16:19:30 2025] livepatch: 'livepatch_sample': starting patching transition [Tue Sep 16 16:19:31 2025] livepatch: 'livepatch_sample': patching complete # echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled # rmmod livepatch_sample # dmesg -T | tail -2 [Tue Sep 16 16:19:36 2025] livepatch: 'livepatch_sample': starting unpatching transition [Tue Sep 16 16:19:37 2025] livepatch: 'livepatch_sample': unpatching complete Cc: stable@vger.kernel.org # v6.9+ Fixes: 199cc14cb4f1 ("LoongArch: Add kernel livepatching support") Reported-by: Xi Zhang Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/stacktrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/stacktrace.c b/arch/loongarch/kernel/stacktrace.c index 9a038d1070d7..387dc4d3c486 100644 --- a/arch/loongarch/kernel/stacktrace.c +++ b/arch/loongarch/kernel/stacktrace.c @@ -51,12 +51,13 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (task == current) { regs->regs[3] = (unsigned long)__builtin_frame_address(0); regs->csr_era = (unsigned long)__builtin_return_address(0); + regs->regs[22] = 0; } else { regs->regs[3] = thread_saved_fp(task); regs->csr_era = thread_saved_ra(task); + regs->regs[22] = task->thread.reg22; } regs->regs[1] = 0; - regs->regs[22] = 0; for (unwind_start(&state, task, regs); !unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) { From ac398f570724c41e5e039d54e4075519f6af7408 Mon Sep 17 00:00:00 2001 From: Guangshuo Li <202321181@mail.sdu.edu.cn> Date: Thu, 18 Sep 2025 19:44:10 +0800 Subject: [PATCH 414/450] LoongArch: vDSO: Check kcalloc() result in init_vdso() Add a NULL-pointer check after the kcalloc() call in init_vdso(). If allocation fails, return -ENOMEM to prevent a possible dereference of vdso_info.code_mapping.pages when it is NULL. Cc: stable@vger.kernel.org Fixes: 2ed119aef60d ("LoongArch: Set correct size for vDSO code mapping") Signed-off-by: Guangshuo Li <202321181@mail.sdu.edu.cn> Signed-off-by: Huacai Chen --- arch/loongarch/kernel/vdso.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c index 7b888d9085a0..dee1a15d7f4c 100644 --- a/arch/loongarch/kernel/vdso.c +++ b/arch/loongarch/kernel/vdso.c @@ -54,6 +54,9 @@ static int __init init_vdso(void) vdso_info.code_mapping.pages = kcalloc(vdso_info.size / PAGE_SIZE, sizeof(struct page *), GFP_KERNEL); + if (!vdso_info.code_mapping.pages) + return -ENOMEM; + pfn = __phys_to_pfn(__pa_symbol(vdso_info.vdso)); for (i = 0; i < vdso_info.size / PAGE_SIZE; i++) vdso_info.code_mapping.pages[i] = pfn_to_page(pfn + i); From 091b29d53fe645781c5c1f405bc9fcd50ce5792b Mon Sep 17 00:00:00 2001 From: Tao Cui Date: Thu, 18 Sep 2025 19:44:22 +0800 Subject: [PATCH 415/450] LoongArch: KVM: Remove unused returns and semicolons The default branch has already handled all undefined cases, so the final return statement is redundant. Redundant semicolons are removed, too. Cc: stable@vger.kernel.org Reviewed-by: Bibo Mao Signed-off-by: Tao Cui Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 2ce41f93b2a4..6c9c7de7226b 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -778,10 +778,8 @@ static long kvm_save_notify(struct kvm_vcpu *vcpu) return 0; default: return KVM_HCALL_INVALID_CODE; - }; - - return KVM_HCALL_INVALID_CODE; -}; + } +} /* * kvm_handle_lsx_disabled() - Guest used LSX while disabled in root. From f58c9aa1065f73d243904b267c71f6a9d1e9f90e Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 18 Sep 2025 19:44:22 +0800 Subject: [PATCH 416/450] LoongArch: KVM: Fix VM migration failure with PTW enabled With PTW disabled system, bit _PAGE_DIRTY is a HW bit for page writing. However with PTW enabled system, bit _PAGE_WRITE is also a "HW bit" for page writing, because hardware synchronizes _PAGE_WRITE to _PAGE_DIRTY automatically. Previously, _PAGE_WRITE is treated as a SW bit to record the page writeable attribute for the fast page fault handling in the secondary MMU, however with PTW enabled machine, this bit is used by HW already (so setting it will silence the TLB modify exception). Here define KVM_PAGE_WRITEABLE with the SW bit _PAGE_MODIFIED, so that it can work on both PTW disabled and enabled machines. And for HW write bits, both _PAGE_DIRTY and _PAGE_WRITE are set or clear together. Cc: stable@vger.kernel.org Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_mmu.h | 20 ++++++++++++++++---- arch/loongarch/kvm/mmu.c | 8 ++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_mmu.h b/arch/loongarch/include/asm/kvm_mmu.h index 099bafc6f797..e36cc7e8ed20 100644 --- a/arch/loongarch/include/asm/kvm_mmu.h +++ b/arch/loongarch/include/asm/kvm_mmu.h @@ -16,6 +16,13 @@ */ #define KVM_MMU_CACHE_MIN_PAGES (CONFIG_PGTABLE_LEVELS - 1) +/* + * _PAGE_MODIFIED is a SW pte bit, it records page ever written on host + * kernel, on secondary MMU it records the page writeable attribute, in + * order for fast path handling. + */ +#define KVM_PAGE_WRITEABLE _PAGE_MODIFIED + #define _KVM_FLUSH_PGTABLE 0x1 #define _KVM_HAS_PGMASK 0x2 #define kvm_pfn_pte(pfn, prot) (((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) @@ -52,10 +59,10 @@ static inline void kvm_set_pte(kvm_pte_t *ptep, kvm_pte_t val) WRITE_ONCE(*ptep, val); } -static inline int kvm_pte_write(kvm_pte_t pte) { return pte & _PAGE_WRITE; } -static inline int kvm_pte_dirty(kvm_pte_t pte) { return pte & _PAGE_DIRTY; } static inline int kvm_pte_young(kvm_pte_t pte) { return pte & _PAGE_ACCESSED; } static inline int kvm_pte_huge(kvm_pte_t pte) { return pte & _PAGE_HUGE; } +static inline int kvm_pte_dirty(kvm_pte_t pte) { return pte & __WRITEABLE; } +static inline int kvm_pte_writeable(kvm_pte_t pte) { return pte & KVM_PAGE_WRITEABLE; } static inline kvm_pte_t kvm_pte_mkyoung(kvm_pte_t pte) { @@ -69,12 +76,12 @@ static inline kvm_pte_t kvm_pte_mkold(kvm_pte_t pte) static inline kvm_pte_t kvm_pte_mkdirty(kvm_pte_t pte) { - return pte | _PAGE_DIRTY; + return pte | __WRITEABLE; } static inline kvm_pte_t kvm_pte_mkclean(kvm_pte_t pte) { - return pte & ~_PAGE_DIRTY; + return pte & ~__WRITEABLE; } static inline kvm_pte_t kvm_pte_mkhuge(kvm_pte_t pte) @@ -87,6 +94,11 @@ static inline kvm_pte_t kvm_pte_mksmall(kvm_pte_t pte) return pte & ~_PAGE_HUGE; } +static inline kvm_pte_t kvm_pte_mkwriteable(kvm_pte_t pte) +{ + return pte | KVM_PAGE_WRITEABLE; +} + static inline int kvm_need_flush(kvm_ptw_ctx *ctx) { return ctx->flag & _KVM_FLUSH_PGTABLE; diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index ed956c5cf2cc..7c8143e79c12 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -569,7 +569,7 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ /* Track access to pages marked old */ new = kvm_pte_mkyoung(*ptep); if (write && !kvm_pte_dirty(new)) { - if (!kvm_pte_write(new)) { + if (!kvm_pte_writeable(new)) { ret = -EFAULT; goto out; } @@ -856,9 +856,9 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) prot_bits |= _CACHE_SUC; if (writeable) { - prot_bits |= _PAGE_WRITE; + prot_bits = kvm_pte_mkwriteable(prot_bits); if (write) - prot_bits |= __WRITEABLE; + prot_bits = kvm_pte_mkdirty(prot_bits); } /* Disable dirty logging on HugePages */ @@ -904,7 +904,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) kvm_release_faultin_page(kvm, page, false, writeable); spin_unlock(&kvm->mmu_lock); - if (prot_bits & _PAGE_DIRTY) + if (kvm_pte_dirty(prot_bits)) mark_page_dirty_in_slot(kvm, memslot, gfn); out: From 47256c4c8b1bfbc63223a0da2d4fa90b6ede5cbb Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 18 Sep 2025 19:44:22 +0800 Subject: [PATCH 417/450] LoongArch: KVM: Avoid copy_*_user() with lock hold in kvm_eiointc_ctrl_access() Function copy_from_user() and copy_to_user() may sleep because of page fault, and they cannot be called in spin_lock hold context. Here move function calling of copy_from_user() and copy_to_user() before spinlock context in function kvm_eiointc_ctrl_access(). Otherwise there will be possible warning such as: BUG: sleeping function called from invalid context at include/linux/uaccess.h:192 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 6292, name: qemu-system-loo preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 INFO: lockdep is turned off. irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last enabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 41 UID: 0 PID: 6292 Comm: qemu-system-loo Tainted: G W 6.17.0-rc3+ #31 PREEMPT(full) Tainted: [W]=WARN Stack : 0000000000000076 0000000000000000 9000000004c28264 9000100092ff4000 9000100092ff7b80 9000100092ff7b88 0000000000000000 9000100092ff7cc8 9000100092ff7cc0 9000100092ff7cc0 9000100092ff7a00 0000000000000001 0000000000000001 9000100092ff7b88 947d2f9216a5e8b9 900010008773d880 00000000ffff8b9f fffffffffffffffe 0000000000000ba1 fffffffffffffffe 000000000000003e 900000000825a15b 000010007ad38000 9000100092ff7ec0 0000000000000000 0000000000000000 9000000006f3ac60 9000000007252000 0000000000000000 00007ff746ff2230 0000000000000053 9000200088a021b0 0000555556c9d190 0000000000000000 9000000004c2827c 000055556cfb5f40 00000000000000b0 0000000000000007 0000000000000007 0000000000071c1d Call Trace: [<9000000004c2827c>] show_stack+0x5c/0x180 [<9000000004c20fac>] dump_stack_lvl+0x94/0xe4 [<9000000004c99c7c>] __might_resched+0x26c/0x290 [<9000000004f68968>] __might_fault+0x20/0x88 [] kvm_eiointc_ctrl_access.isra.0+0x88/0x380 [kvm] [] kvm_device_ioctl+0x194/0x290 [kvm] [<900000000506b0d8>] sys_ioctl+0x388/0x1010 [<90000000063ed210>] do_syscall+0xb0/0x2d8 [<9000000004c25ef8>] handle_syscall+0xb8/0x158 Cc: stable@vger.kernel.org Fixes: 1ad7efa552fd5 ("LoongArch: KVM: Add EIOINTC user mode read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 026b139dcff2..2b3c5203738a 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -426,21 +426,26 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, struct loongarch_eiointc *s = dev->kvm->arch.eiointc; data = (void __user *)attr->addr; + switch (type) { + case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: + case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE: + if (copy_from_user(&val, data, 4)) + return -EFAULT; + break; + default: + break; + } + spin_lock_irqsave(&s->lock, flags); switch (type) { case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU: - if (copy_from_user(&val, data, 4)) - ret = -EFAULT; - else { - if (val >= EIOINTC_ROUTE_MAX_VCPUS) - ret = -EINVAL; - else - s->num_cpu = val; - } + if (val >= EIOINTC_ROUTE_MAX_VCPUS) + ret = -EINVAL; + else + s->num_cpu = val; break; case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_FEATURE: - if (copy_from_user(&s->features, data, 4)) - ret = -EFAULT; + s->features = val; if (!(s->features & BIT(EIOINTC_HAS_VIRT_EXTENSION))) s->status |= BIT(EIOINTC_ENABLE); break; From 62f11796a0dfa1a2ef5f50a2d1bc81c81628fb8e Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 18 Sep 2025 19:44:22 +0800 Subject: [PATCH 418/450] LoongArch: KVM: Avoid copy_*_user() with lock hold in kvm_eiointc_regs_access() Function copy_from_user() and copy_to_user() may sleep because of page fault, and they cannot be called in spin_lock hold context. Here move function calling of copy_from_user() and copy_to_user() before spinlock context in function kvm_eiointc_ctrl_access(). Otherwise there will be possible warning such as: BUG: sleeping function called from invalid context at include/linux/uaccess.h:192 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 6292, name: qemu-system-loo preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 INFO: lockdep is turned off. irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last enabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 41 UID: 0 PID: 6292 Comm: qemu-system-loo Tainted: G W 6.17.0-rc3+ #31 PREEMPT(full) Tainted: [W]=WARN Stack : 0000000000000076 0000000000000000 9000000004c28264 9000100092ff4000 9000100092ff7b80 9000100092ff7b88 0000000000000000 9000100092ff7cc8 9000100092ff7cc0 9000100092ff7cc0 9000100092ff7a00 0000000000000001 0000000000000001 9000100092ff7b88 947d2f9216a5e8b9 900010008773d880 00000000ffff8b9f fffffffffffffffe 0000000000000ba1 fffffffffffffffe 000000000000003e 900000000825a15b 000010007ad38000 9000100092ff7ec0 0000000000000000 0000000000000000 9000000006f3ac60 9000000007252000 0000000000000000 00007ff746ff2230 0000000000000053 9000200088a021b0 0000555556c9d190 0000000000000000 9000000004c2827c 000055556cfb5f40 00000000000000b0 0000000000000007 0000000000000007 0000000000071c1d Call Trace: [<9000000004c2827c>] show_stack+0x5c/0x180 [<9000000004c20fac>] dump_stack_lvl+0x94/0xe4 [<9000000004c99c7c>] __might_resched+0x26c/0x290 [<9000000004f68968>] __might_fault+0x20/0x88 [] kvm_eiointc_regs_access.isra.0+0x88/0x380 [kvm] [] kvm_device_ioctl+0x194/0x290 [kvm] [<900000000506b0d8>] sys_ioctl+0x388/0x1010 [<90000000063ed210>] do_syscall+0xb0/0x2d8 [<9000000004c25ef8>] handle_syscall+0xb8/0x158 Cc: stable@vger.kernel.org Fixes: 1ad7efa552fd5 ("LoongArch: KVM: Add EIOINTC user mode read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 33 ++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 2b3c5203738a..6455420c7737 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -467,19 +467,17 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, static int kvm_eiointc_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, - bool is_write) + bool is_write, int *data) { int addr, cpu, offset, ret = 0; unsigned long flags; void *p = NULL; - void __user *data; struct loongarch_eiointc *s; s = dev->kvm->arch.eiointc; addr = attr->attr; cpu = addr >> 16; addr &= 0xffff; - data = (void __user *)attr->addr; switch (addr) { case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: offset = (addr - EIOINTC_NODETYPE_START) / 4; @@ -518,13 +516,10 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, } spin_lock_irqsave(&s->lock, flags); - if (is_write) { - if (copy_from_user(p, data, 4)) - ret = -EFAULT; - } else { - if (copy_to_user(data, p, 4)) - ret = -EFAULT; - } + if (is_write) + memcpy(p, data, 4); + else + memcpy(data, p, 4); spin_unlock_irqrestore(&s->lock, flags); return ret; @@ -581,9 +576,18 @@ static int kvm_eiointc_sw_status_access(struct kvm_device *dev, static int kvm_eiointc_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { + int ret, data; + switch (attr->group) { case KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS: - return kvm_eiointc_regs_access(dev, attr, false); + ret = kvm_eiointc_regs_access(dev, attr, false, &data); + if (ret) + return ret; + + if (copy_to_user((void __user *)attr->addr, &data, 4)) + ret = -EFAULT; + + return ret; case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: return kvm_eiointc_sw_status_access(dev, attr, false); default: @@ -594,11 +598,16 @@ static int kvm_eiointc_get_attr(struct kvm_device *dev, static int kvm_eiointc_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { + int data; + switch (attr->group) { case KVM_DEV_LOONGARCH_EXTIOI_GRP_CTRL: return kvm_eiointc_ctrl_access(dev, attr); case KVM_DEV_LOONGARCH_EXTIOI_GRP_REGS: - return kvm_eiointc_regs_access(dev, attr, true); + if (copy_from_user(&data, (void __user *)attr->addr, 4)) + return -EFAULT; + + return kvm_eiointc_regs_access(dev, attr, true, &data); case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: return kvm_eiointc_sw_status_access(dev, attr, true); default: From 01a8e68396a6d51f5ba92021ad1a4b8eaabdd0e7 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 18 Sep 2025 19:44:22 +0800 Subject: [PATCH 419/450] LoongArch: KVM: Avoid copy_*_user() with lock hold in kvm_eiointc_sw_status_access() Function copy_from_user() and copy_to_user() may sleep because of page fault, and they cannot be called in spin_lock hold context. Here move funtcion calling of copy_from_user() and copy_to_user() out of function kvm_eiointc_sw_status_access(). Otherwise there will be possible warning such as: BUG: sleeping function called from invalid context at include/linux/uaccess.h:192 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 6292, name: qemu-system-loo preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 INFO: lockdep is turned off. irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last enabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 41 UID: 0 PID: 6292 Comm: qemu-system-loo Tainted: G W 6.17.0-rc3+ #31 PREEMPT(full) Tainted: [W]=WARN Stack : 0000000000000076 0000000000000000 9000000004c28264 9000100092ff4000 9000100092ff7b80 9000100092ff7b88 0000000000000000 9000100092ff7cc8 9000100092ff7cc0 9000100092ff7cc0 9000100092ff7a00 0000000000000001 0000000000000001 9000100092ff7b88 947d2f9216a5e8b9 900010008773d880 00000000ffff8b9f fffffffffffffffe 0000000000000ba1 fffffffffffffffe 000000000000003e 900000000825a15b 000010007ad38000 9000100092ff7ec0 0000000000000000 0000000000000000 9000000006f3ac60 9000000007252000 0000000000000000 00007ff746ff2230 0000000000000053 9000200088a021b0 0000555556c9d190 0000000000000000 9000000004c2827c 000055556cfb5f40 00000000000000b0 0000000000000007 0000000000000007 0000000000071c1d Call Trace: [<9000000004c2827c>] show_stack+0x5c/0x180 [<9000000004c20fac>] dump_stack_lvl+0x94/0xe4 [<9000000004c99c7c>] __might_resched+0x26c/0x290 [<9000000004f68968>] __might_fault+0x20/0x88 [] kvm_eiointc_sw_status_access.isra.0+0x88/0x380 [kvm] [] kvm_device_ioctl+0x194/0x290 [kvm] [<900000000506b0d8>] sys_ioctl+0x388/0x1010 [<90000000063ed210>] do_syscall+0xb0/0x2d8 [<9000000004c25ef8>] handle_syscall+0xb8/0x158 Cc: stable@vger.kernel.org Fixes: 1ad7efa552fd5 ("LoongArch: KVM: Add EIOINTC user mode read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index 6455420c7737..c32333695381 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -527,19 +527,17 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, static int kvm_eiointc_sw_status_access(struct kvm_device *dev, struct kvm_device_attr *attr, - bool is_write) + bool is_write, int *data) { int addr, ret = 0; unsigned long flags; void *p = NULL; - void __user *data; struct loongarch_eiointc *s; s = dev->kvm->arch.eiointc; addr = attr->attr; addr &= 0xffff; - data = (void __user *)attr->addr; switch (addr) { case KVM_DEV_LOONGARCH_EXTIOI_SW_STATUS_NUM_CPU: if (is_write) @@ -561,13 +559,10 @@ static int kvm_eiointc_sw_status_access(struct kvm_device *dev, return -EINVAL; } spin_lock_irqsave(&s->lock, flags); - if (is_write) { - if (copy_from_user(p, data, 4)) - ret = -EFAULT; - } else { - if (copy_to_user(data, p, 4)) - ret = -EFAULT; - } + if (is_write) + memcpy(p, data, 4); + else + memcpy(data, p, 4); spin_unlock_irqrestore(&s->lock, flags); return ret; @@ -589,7 +584,14 @@ static int kvm_eiointc_get_attr(struct kvm_device *dev, return ret; case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: - return kvm_eiointc_sw_status_access(dev, attr, false); + ret = kvm_eiointc_sw_status_access(dev, attr, false, &data); + if (ret) + return ret; + + if (copy_to_user((void __user *)attr->addr, &data, 4)) + ret = -EFAULT; + + return ret; default: return -EINVAL; } @@ -609,7 +611,10 @@ static int kvm_eiointc_set_attr(struct kvm_device *dev, return kvm_eiointc_regs_access(dev, attr, true, &data); case KVM_DEV_LOONGARCH_EXTIOI_GRP_SW_STATUS: - return kvm_eiointc_sw_status_access(dev, attr, true); + if (copy_from_user(&data, (void __user *)attr->addr, 4)) + return -EFAULT; + + return kvm_eiointc_sw_status_access(dev, attr, true, &data); default: return -EINVAL; } From 8dc5245673cf7f33743e5c0d2a4207c0b8df3067 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 18 Sep 2025 19:44:25 +0800 Subject: [PATCH 420/450] LoongArch: KVM: Avoid copy_*_user() with lock hold in kvm_pch_pic_regs_access() Function copy_from_user() and copy_to_user() may sleep because of page fault, and they cannot be called in spin_lock hold context. Here move function calling of copy_from_user() and copy_to_user() out of spinlock context in function kvm_pch_pic_regs_access(). Otherwise there will be possible warning such as: BUG: sleeping function called from invalid context at include/linux/uaccess.h:192 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 6292, name: qemu-system-loo preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 INFO: lockdep is turned off. irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last enabled at (0): [<9000000004c4a554>] copy_process+0x90c/0x1d40 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 41 UID: 0 PID: 6292 Comm: qemu-system-loo Tainted: G W 6.17.0-rc3+ #31 PREEMPT(full) Tainted: [W]=WARN Stack : 0000000000000076 0000000000000000 9000000004c28264 9000100092ff4000 9000100092ff7b80 9000100092ff7b88 0000000000000000 9000100092ff7cc8 9000100092ff7cc0 9000100092ff7cc0 9000100092ff7a00 0000000000000001 0000000000000001 9000100092ff7b88 947d2f9216a5e8b9 900010008773d880 00000000ffff8b9f fffffffffffffffe 0000000000000ba1 fffffffffffffffe 000000000000003e 900000000825a15b 000010007ad38000 9000100092ff7ec0 0000000000000000 0000000000000000 9000000006f3ac60 9000000007252000 0000000000000000 00007ff746ff2230 0000000000000053 9000200088a021b0 0000555556c9d190 0000000000000000 9000000004c2827c 000055556cfb5f40 00000000000000b0 0000000000000007 0000000000000007 0000000000071c1d Call Trace: [<9000000004c2827c>] show_stack+0x5c/0x180 [<9000000004c20fac>] dump_stack_lvl+0x94/0xe4 [<9000000004c99c7c>] __might_resched+0x26c/0x290 [<9000000004f68968>] __might_fault+0x20/0x88 [] kvm_pch_pic_regs_access.isra.0+0x88/0x380 [kvm] [] kvm_device_ioctl+0x194/0x290 [kvm] [<900000000506b0d8>] sys_ioctl+0x388/0x1010 [<90000000063ed210>] do_syscall+0xb0/0x2d8 [<9000000004c25ef8>] handle_syscall+0xb8/0x158 Cc: stable@vger.kernel.org Fixes: d206d95148732 ("LoongArch: KVM: Add PCHPIC user mode read and write functions") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/pch_pic.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c index 119290bcea79..baf3b4faf7ea 100644 --- a/arch/loongarch/kvm/intc/pch_pic.c +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -348,6 +348,7 @@ static int kvm_pch_pic_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, bool is_write) { + char buf[8]; int addr, offset, len = 8, ret = 0; void __user *data; void *p = NULL; @@ -397,17 +398,23 @@ static int kvm_pch_pic_regs_access(struct kvm_device *dev, return -EINVAL; } - spin_lock(&s->lock); - /* write or read value according to is_write */ if (is_write) { - if (copy_from_user(p, data, len)) - ret = -EFAULT; - } else { - if (copy_to_user(data, p, len)) - ret = -EFAULT; + if (copy_from_user(buf, data, len)) + return -EFAULT; } + + spin_lock(&s->lock); + if (is_write) + memcpy(p, buf, len); + else + memcpy(buf, p, len); spin_unlock(&s->lock); + if (!is_write) { + if (copy_to_user(data, buf, len)) + return -EFAULT; + } + return ret; } From 3fbfe251cc9f6d391944282cdb9bcf0bd02e01f8 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 17 Sep 2025 16:48:54 +0300 Subject: [PATCH 421/450] Revert "net/mlx5e: Update and set Xon/Xoff upon port speed set" This reverts commit d24341740fe48add8a227a753e68b6eedf4b385a. It causes errors when trying to configure QoS, as well as loss of L2 connectivity (on multi-host devices). Reported-by: Jakub Kicinski Link: https://lore.kernel.org/20250910170011.70528106@kernel.org Fixes: d24341740fe4 ("net/mlx5e: Update and set Xon/Xoff upon port speed set") Signed-off-by: Tariq Toukan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index e680673ffb72..15eded36b872 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -139,8 +139,6 @@ void mlx5e_update_carrier(struct mlx5e_priv *priv) if (up) { netdev_info(priv->netdev, "Link up\n"); netif_carrier_on(priv->netdev); - mlx5e_port_manual_buffer_config(priv, 0, priv->netdev->mtu, - NULL, NULL, NULL); } else { netdev_info(priv->netdev, "Link down\n"); netif_carrier_off(priv->netdev); From 87ebb628a5acb892eba41ef1d8989beb8f036034 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2025 13:53:37 +0000 Subject: [PATCH 422/450] net: clear sk->sk_ino in sk_set_socket(sk, NULL) Andrei Vagin reported that blamed commit broke CRIU. Indeed, while we want to keep sk_uid unchanged when a socket is cloned, we want to clear sk->sk_ino. Otherwise, sock_diag might report multiple sockets sharing the same inode number. Move the clearing part from sock_orphan() to sk_set_socket(sk, NULL), called both from sock_orphan() and sk_clone_lock(). Fixes: 5d6b58c932ec ("net: lockless sock_i_ino()") Closes: https://lore.kernel.org/netdev/aMhX-VnXkYDpKd9V@google.com/ Closes: https://github.com/checkpoint-restore/criu/issues/2744 Reported-by: Andrei Vagin Signed-off-by: Eric Dumazet Acked-by: Andrei Vagin Link: https://patch.msgid.link/20250917135337.1736101-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index fb13322a11fc..2e14283c5be1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2061,6 +2061,9 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) if (sock) { WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } else { + /* Note: sk_uid is unchanged. */ + WRITE_ONCE(sk->sk_ino, 0); } } @@ -2082,8 +2085,6 @@ static inline void sock_orphan(struct sock *sk) sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; - /* Note: sk_uid is unchanged. */ - WRITE_ONCE(sk->sk_ino, 0); write_unlock_bh(&sk->sk_callback_lock); } From cca7b1cfd7b8a0eff2a3510c5e0f10efe8fa3758 Mon Sep 17 00:00:00 2001 From: Alexey Nepomnyashih Date: Wed, 17 Sep 2025 15:30:58 +0000 Subject: [PATCH 423/450] net: liquidio: fix overflow in octeon_init_instr_queue() The expression `(conf->instr_type == 64) << iq_no` can overflow because `iq_no` may be as high as 64 (`CN23XX_MAX_RINGS_PER_PF`). Casting the operand to `u64` ensures correct 64-bit arithmetic. Fixes: f21fb3ed364b ("Add support of Cavium Liquidio ethernet adapters") Signed-off-by: Alexey Nepomnyashih Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cavium/liquidio/request_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c index de8a6ce86ad7..12105ffb5dac 100644 --- a/drivers/net/ethernet/cavium/liquidio/request_manager.c +++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c @@ -126,7 +126,7 @@ int octeon_init_instr_queue(struct octeon_device *oct, oct->io_qmask.iq |= BIT_ULL(iq_no); /* Set the 32B/64B mode for each input queue */ - oct->io_qmask.iq64B |= ((conf->instr_type == 64) << iq_no); + oct->io_qmask.iq64B |= ((u64)(conf->instr_type == 64) << iq_no); iq->iqcmd_64B = (conf->instr_type == 64); oct->fn_list.setup_iq_regs(oct, iq_no); From 7736aff4704188d4483b7b36ff06d201c8be4844 Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Thu, 18 Sep 2025 12:15:56 +0300 Subject: [PATCH 424/450] MAINTAINERS: update sundance entry Signed-off-by: Denis Kirjanov Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 086b4a7bd04f..70c9d368c8d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24255,7 +24255,7 @@ F: Documentation/devicetree/bindings/input/allwinner,sun4i-a10-lradc-keys.yaml F: drivers/input/keyboard/sun4i-lradc-keys.c SUNDANCE NETWORK DRIVER -M: Denis Kirjanov +M: Denis Kirjanov L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/dlink/sundance.c From 3191df0a4882c827cac29925e80ecb1775b904bd Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 18 Sep 2025 13:15:06 +0300 Subject: [PATCH 425/450] devlink rate: Remove unnecessary 'static' from a couple places devlink_rate_node_get_by_name() and devlink_rate_nodes_destroy() have a couple of unnecessary static variables for iterating over devlink rates. This could lead to races/corruption/unhappiness if two concurrent operations execute the same function. Remove 'static' from both. It's amazing this was missed for 4+ years. While at it, I confirmed there are no more examples of this mistake in net/ with 1, 2 or 3 levels of indentation. Fixes: a8ecb93ef03d ("devlink: Introduce rate nodes") Signed-off-by: Cosmin Ratiu Signed-off-by: Jakub Kicinski --- net/devlink/rate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 110b3fa8a0b1..264fb82cba19 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -34,7 +34,7 @@ devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { - static struct devlink_rate *devlink_rate; + struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && @@ -819,8 +819,8 @@ EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); */ void devl_rate_nodes_destroy(struct devlink *devlink) { - static struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; + struct devlink_rate *devlink_rate, *tmp; devl_assert_locked(devlink); From cfa7d9b1e3a8604afc84e9e51d789c29574fb216 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Wed, 17 Sep 2025 13:46:02 +0800 Subject: [PATCH 426/450] cnic: Fix use-after-free bugs in cnic_delete_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original code uses cancel_delayed_work() in cnic_cm_stop_bnx2x_hw(), which does not guarantee that the delayed work item 'delete_task' has fully completed if it was already running. Additionally, the delayed work item is cyclic, the flush_workqueue() in cnic_cm_stop_bnx2x_hw() only blocks and waits for work items that were already queued to the workqueue prior to its invocation. Any work items submitted after flush_workqueue() is called are not included in the set of tasks that the flush operation awaits. This means that after the cyclic work items have finished executing, a delayed work item may still exist in the workqueue. This leads to use-after-free scenarios where the cnic_dev is deallocated by cnic_free_dev(), while delete_task remains active and attempt to dereference cnic_dev in cnic_delete_task(). A typical race condition is illustrated below: CPU 0 (cleanup) | CPU 1 (delayed work callback) cnic_netdev_event() | cnic_stop_hw() | cnic_delete_task() cnic_cm_stop_bnx2x_hw() | ... cancel_delayed_work() | /* the queue_delayed_work() flush_workqueue() | executes after flush_workqueue()*/ | queue_delayed_work() cnic_free_dev(dev)//free | cnic_delete_task() //new instance | dev = cp->dev; //use Replace cancel_delayed_work() with cancel_delayed_work_sync() to ensure that the cyclic delayed work item is properly canceled and that any ongoing execution of the work item completes before the cnic_dev is deallocated. Furthermore, since cancel_delayed_work_sync() uses __flush_work(work, true) to synchronously wait for any currently executing instance of the work item to finish, the flush_workqueue() becomes redundant and should be removed. This bug was identified through static analysis. To reproduce the issue and validate the fix, I simulated the cnic PCI device in QEMU and introduced intentional delays — such as inserting calls to ssleep() within the cnic_delete_task() function — to increase the likelihood of triggering the bug. Fixes: fdf24086f475 ("cnic: Defer iscsi connection cleanup") Signed-off-by: Duoming Zhou Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/cnic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index a9040c42d2ff..6e97a5a7daaf 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -4230,8 +4230,7 @@ static void cnic_cm_stop_bnx2x_hw(struct cnic_dev *dev) cnic_bnx2x_delete_wait(dev, 0); - cancel_delayed_work(&cp->delete_task); - flush_workqueue(cnic_wq); + cancel_delayed_work_sync(&cp->delete_task); if (atomic_read(&cp->iscsi_conn) != 0) netdev_warn(dev->netdev, "%d iSCSI connections not destroyed\n", From f8b4687151021db61841af983f1cb7be6915d4ef Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Wed, 17 Sep 2025 14:38:53 +0800 Subject: [PATCH 427/450] octeontx2-pf: Fix use-after-free bugs in otx2_sync_tstamp() The original code relies on cancel_delayed_work() in otx2_ptp_destroy(), which does not ensure that the delayed work item synctstamp_work has fully completed if it was already running. This leads to use-after-free scenarios where otx2_ptp is deallocated by otx2_ptp_destroy(), while synctstamp_work remains active and attempts to dereference otx2_ptp in otx2_sync_tstamp(). Furthermore, the synctstamp_work is cyclic, the likelihood of triggering the bug is nonnegligible. A typical race condition is illustrated below: CPU 0 (cleanup) | CPU 1 (delayed work callback) otx2_remove() | otx2_ptp_destroy() | otx2_sync_tstamp() cancel_delayed_work() | kfree(ptp) | | ptp = container_of(...); //UAF | ptp-> //UAF This is confirmed by a KASAN report: BUG: KASAN: slab-use-after-free in __run_timer_base.part.0+0x7d7/0x8c0 Write of size 8 at addr ffff88800aa09a18 by task bash/136 ... Call Trace: dump_stack_lvl+0x55/0x70 print_report+0xcf/0x610 ? __run_timer_base.part.0+0x7d7/0x8c0 kasan_report+0xb8/0xf0 ? __run_timer_base.part.0+0x7d7/0x8c0 __run_timer_base.part.0+0x7d7/0x8c0 ? __pfx___run_timer_base.part.0+0x10/0x10 ? __pfx_read_tsc+0x10/0x10 ? ktime_get+0x60/0x140 ? lapic_next_event+0x11/0x20 ? clockevents_program_event+0x1d4/0x2a0 run_timer_softirq+0xd1/0x190 handle_softirqs+0x16a/0x550 irq_exit_rcu+0xaf/0xe0 sysvec_apic_timer_interrupt+0x70/0x80 ... Allocated by task 1: kasan_save_stack+0x24/0x50 kasan_save_track+0x14/0x30 __kasan_kmalloc+0x7f/0x90 otx2_ptp_init+0xb1/0x860 otx2_probe+0x4eb/0xc30 local_pci_probe+0xdc/0x190 pci_device_probe+0x2fe/0x470 really_probe+0x1ca/0x5c0 __driver_probe_device+0x248/0x310 driver_probe_device+0x44/0x120 __driver_attach+0xd2/0x310 bus_for_each_dev+0xed/0x170 bus_add_driver+0x208/0x500 driver_register+0x132/0x460 do_one_initcall+0x89/0x300 kernel_init_freeable+0x40d/0x720 kernel_init+0x1a/0x150 ret_from_fork+0x10c/0x1a0 ret_from_fork_asm+0x1a/0x30 Freed by task 136: kasan_save_stack+0x24/0x50 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3a/0x60 __kasan_slab_free+0x3f/0x50 kfree+0x137/0x370 otx2_ptp_destroy+0x38/0x80 otx2_remove+0x10d/0x4c0 pci_device_remove+0xa6/0x1d0 device_release_driver_internal+0xf8/0x210 pci_stop_bus_device+0x105/0x150 pci_stop_and_remove_bus_device_locked+0x15/0x30 remove_store+0xcc/0xe0 kernfs_fop_write_iter+0x2c3/0x440 vfs_write+0x871/0xd70 ksys_write+0xee/0x1c0 do_syscall_64+0xac/0x280 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... Replace cancel_delayed_work() with cancel_delayed_work_sync() to ensure that the delayed work item is properly canceled before the otx2_ptp is deallocated. This bug was initially identified through static analysis. To reproduce and test it, I simulated the OcteonTX2 PCI device in QEMU and introduced artificial delays within the otx2_sync_tstamp() function to increase the likelihood of triggering the bug. Fixes: 2958d17a8984 ("octeontx2-pf: Add support for ptp 1-step mode on CN10K silicon") Signed-off-by: Duoming Zhou Reviewed-by: Vadim Fedorenko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c index e52cc6b1a26c..dedd586ed310 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c @@ -491,7 +491,7 @@ void otx2_ptp_destroy(struct otx2_nic *pfvf) if (!ptp) return; - cancel_delayed_work(&pfvf->ptp->synctstamp_work); + cancel_delayed_work_sync(&pfvf->ptp->synctstamp_work); ptp_clock_unregister(ptp->ptp_clock); kfree(ptp); From 3539b1467e94336d5854ebf976d9627bfb65d6c3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 18 Sep 2025 10:21:14 -0600 Subject: [PATCH 428/450] io_uring: include dying ring in task_work "should cancel" state When running task_work for an exiting task, rather than perform the issue retry attempt, the task_work is canceled. However, this isn't done for a ring that has been closed. This can lead to requests being successfully completed post the ring being closed, which is somewhat confusing and surprising to an application. Rather than just check the task exit state, also include the ring ref state in deciding whether or not to terminate a given request when run from task_work. Cc: stable@vger.kernel.org # 6.1+ Link: https://github.com/axboe/liburing/discussions/1459 Reported-by: Benedek Thaler Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ++++-- io_uring/io_uring.h | 4 ++-- io_uring/poll.c | 2 +- io_uring/timeout.c | 2 +- io_uring/uring_cmd.c | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 93633613a165..bcec12256f34 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1406,8 +1406,10 @@ static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, tw); - if (unlikely(io_should_terminate_tw())) + struct io_ring_ctx *ctx = req->ctx; + + io_tw_lock(ctx, tw); + if (unlikely(io_should_terminate_tw(ctx))) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index abc6de227f74..1880902be6fd 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -476,9 +476,9 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) * 2) PF_KTHREAD is set, in which case the invoker of the task_work is * our fallback task_work. */ -static inline bool io_should_terminate_tw(void) +static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) { - return current->flags & (PF_KTHREAD | PF_EXITING); + return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs); } static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) diff --git a/io_uring/poll.c b/io_uring/poll.c index c786e587563b..6090a26975d4 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -224,7 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; - if (unlikely(io_should_terminate_tw())) + if (unlikely(io_should_terminate_tw(req->ctx))) return -ECANCELED; do { diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7f13bfa9f2b6..17e3aab0af36 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -324,7 +324,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) int ret; if (prev) { - if (!io_should_terminate_tw()) { + if (!io_should_terminate_tw(req->ctx)) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 053bac89b6c0..213716e10d70 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -118,7 +118,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; - if (io_should_terminate_tw()) + if (io_should_terminate_tw(req->ctx)) flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deffered list completion */ From 2ade36eaa9ac05e4913e9785df19c2cde8f912fb Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 17 Sep 2025 12:42:09 -0400 Subject: [PATCH 429/450] drm/amdkfd: add proper handling for S0ix When in S0i3, the GFX state is retained, so all we need to do is stop the runlist so GFX can enter gfxoff. Reviewed-by: Mario Limonciello (AMD) Tested-by: David Perry Signed-off-by: Alex Deucher (cherry picked from commit 4bfa8609934dbf39bbe6e75b4f971469384b50b1) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 16 +++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 12 ++++++++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 36 ++++++++++++++++++++++ 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index fbe7616555c8..a2879d2b7c8e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -250,16 +250,24 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev, void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc) { - if (adev->kfd.dev) - kgd2kfd_suspend(adev->kfd.dev, suspend_proc); + if (adev->kfd.dev) { + if (adev->in_s0ix) + kgd2kfd_stop_sched_all_nodes(adev->kfd.dev); + else + kgd2kfd_suspend(adev->kfd.dev, suspend_proc); + } } int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool resume_proc) { int r = 0; - if (adev->kfd.dev) - r = kgd2kfd_resume(adev->kfd.dev, resume_proc); + if (adev->kfd.dev) { + if (adev->in_s0ix) + r = kgd2kfd_start_sched_all_nodes(adev->kfd.dev); + else + r = kgd2kfd_resume(adev->kfd.dev, resume_proc); + } return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 33eb4826b58b..aa88bad7416b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -426,7 +426,9 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask); int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd); void kgd2kfd_unlock_kfd(struct kfd_dev *kfd); int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id); +int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd); int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id); +int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd); bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id); bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry, bool retry_fault); @@ -516,11 +518,21 @@ static inline int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) return 0; } +static inline int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd) +{ + return 0; +} + static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) { return 0; } +static inline int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd) +{ + return 0; +} + static inline bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) { return false; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 7e749f9b6d69..349c351e242b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1550,6 +1550,25 @@ int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id) return ret; } +int kgd2kfd_start_sched_all_nodes(struct kfd_dev *kfd) +{ + struct kfd_node *node; + int i, r; + + if (!kfd->init_complete) + return 0; + + for (i = 0; i < kfd->num_nodes; i++) { + node = kfd->nodes[i]; + r = node->dqm->ops.unhalt(node->dqm); + if (r) { + dev_err(kfd_device, "Error in starting scheduler\n"); + return r; + } + } + return 0; +} + int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) { struct kfd_node *node; @@ -1567,6 +1586,23 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) return node->dqm->ops.halt(node->dqm); } +int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd) +{ + struct kfd_node *node; + int i, r; + + if (!kfd->init_complete) + return 0; + + for (i = 0; i < kfd->num_nodes; i++) { + node = kfd->nodes[i]; + r = node->dqm->ops.halt(node->dqm); + if (r) + return r; + } + return 0; +} + bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) { struct kfd_node *node; From 9272bb34b066993f5f468b219b4a26ba3f2b25a1 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 17 Sep 2025 12:42:11 -0400 Subject: [PATCH 430/450] drm/amdgpu: suspend KFD and KGD user queues for S0ix We need to make sure the user queues are preempted so GFX can enter gfxoff. Reviewed-by: Mario Limonciello (AMD) Tested-by: David Perry Signed-off-by: Alex Deucher (cherry picked from commit f8b367e6fa1716cab7cc232b9e3dff29187fc99d) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 +++++++++------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 01d234cf8156..c8459337fcb8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5136,7 +5136,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) adev->in_suspend = true; if (amdgpu_sriov_vf(adev)) { - if (!adev->in_s0ix && !adev->in_runpm) + if (!adev->in_runpm) amdgpu_amdkfd_suspend_process(adev); amdgpu_virt_fini_data_exchange(adev); r = amdgpu_virt_request_full_gpu(adev, false); @@ -5156,10 +5156,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) amdgpu_device_ip_suspend_phase1(adev); - if (!adev->in_s0ix) { - amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); - amdgpu_userq_suspend(adev); - } + amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); + amdgpu_userq_suspend(adev); r = amdgpu_device_evict_resources(adev); if (r) @@ -5254,15 +5252,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) goto exit; } - if (!adev->in_s0ix) { - r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); - if (r) - goto exit; + r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); + if (r) + goto exit; - r = amdgpu_userq_resume(adev); - if (r) - goto exit; - } + r = amdgpu_userq_resume(adev); + if (r) + goto exit; r = amdgpu_device_ip_late_init(adev); if (r) @@ -5275,7 +5271,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) amdgpu_virt_init_data_exchange(adev); amdgpu_virt_release_full_gpu(adev, true); - if (!adev->in_s0ix && !r && !adev->in_runpm) + if (!r && !adev->in_runpm) r = amdgpu_amdkfd_resume_process(adev); } From d33c3471047fc54966621d19329e6a23ebc8ec50 Mon Sep 17 00:00:00 2001 From: Praful Adiga Date: Thu, 18 Sep 2025 12:40:18 -0400 Subject: [PATCH 431/450] ALSA: hda/realtek: Fix mute led for HP Laptop 15-dw4xx This laptop uses the ALC236 codec with COEF 0x7 and idx 1 to control the mute LED. Enable the existing quirk for this device. Signed-off-by: Praful Adiga Cc: Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 618ec00d3f98..f267437c9698 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -6476,6 +6476,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8992, "HP EliteBook 845 G9", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8994, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8995, "HP EliteBook 855 G9", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x89a0, "HP Laptop 15-dw4xxx", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x89a4, "HP ProBook 440 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89a6, "HP ProBook 450 G9", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89aa, "HP EliteBook 630 G9", ALC236_FIXUP_HP_GPIO_LED), From df8922afc37aa2111ca79a216653a629146763ad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 18 Sep 2025 13:59:15 -0600 Subject: [PATCH 432/450] io_uring/msg_ring: kill alloc_cache for io_kiocb allocations A recent commit: fc582cd26e88 ("io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU") fixed an issue with not deferring freeing of io_kiocb structs that msg_ring allocates to after the current RCU grace period. But this only covers requests that don't end up in the allocation cache. If a request goes into the alloc cache, it can get reused before it is sane to do so. A recent syzbot report would seem to indicate that there's something there, however it may very well just be because of the KASAN poisoning that the alloc_cache handles manually. Rather than attempt to make the alloc_cache sane for that use case, just drop the usage of the alloc_cache for msg_ring request payload data. Fixes: 50cf5f3842af ("io_uring/msg_ring: add an alloc cache for io_kiocb entries") Link: https://lore.kernel.org/io-uring/68cc2687.050a0220.139b6.0005.GAE@google.com/ Reported-by: syzbot+baa2e0f4e02df602583e@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- io_uring/io_uring.c | 4 ---- io_uring/msg_ring.c | 24 ++---------------------- 3 files changed, 2 insertions(+), 29 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 80a178f3d896..12f5ee43850e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -420,9 +420,6 @@ struct io_ring_ctx { struct list_head defer_list; unsigned nr_drained; - struct io_alloc_cache msg_cache; - spinlock_t msg_lock; - #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bcec12256f34..93665cebe9bd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -290,7 +290,6 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); - io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_rsrc_cache_free(ctx); } @@ -337,9 +336,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_cmd), sizeof(struct io_async_cmd)); - spin_lock_init(&ctx->msg_lock); - ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); ret |= io_rsrc_cache_init(ctx); if (ret) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 4c2578f2efcb..5e5b94236d72 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,7 +11,6 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" -#include "alloc_cache.h" #include "msg_ring.h" /* All valid masks for MSG_RING */ @@ -76,13 +75,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); - if (spin_trylock(&ctx->msg_lock)) { - if (io_alloc_cache_put(&ctx->msg_cache, req)) - req = NULL; - spin_unlock(&ctx->msg_lock); - } - if (req) - kfree_rcu(req, rcu_head); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } @@ -104,26 +97,13 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } -static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) -{ - struct io_kiocb *req = NULL; - - if (spin_trylock(&ctx->msg_lock)) { - req = io_alloc_cache_get(&ctx->msg_cache); - spin_unlock(&ctx->msg_lock); - if (req) - return req; - } - return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); -} - static int io_msg_data_remote(struct io_ring_ctx *target_ctx, struct io_msg *msg) { struct io_kiocb *target; u32 flags = 0; - target = io_msg_get_kiocb(target_ctx); + target = kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO) ; if (unlikely(!target)) return -ENOMEM; From ef442fc5c1a9a2a232de85a0e6967f388b6c0c8e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 11 Sep 2025 11:57:44 -0400 Subject: [PATCH 433/450] rv: Add Gabriele Monaco as maintainer for Runtime Verification Gabriele will start taking over managing the changes to the Runtime Verification. Make him officially one of the maintainers. Cc: Gabriele Monaco Cc: Linus Torvalds Link: https://lore.kernel.org/20250911115744.66ccade3@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index cd7ff55b5d32..17073c075bf7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22048,6 +22048,7 @@ F: drivers/infiniband/ulp/rtrs/ RUNTIME VERIFICATION (RV) M: Steven Rostedt +M: Gabriele Monaco L: linux-trace-kernel@vger.kernel.org S: Maintained F: Documentation/trace/rv/ From 251090e2c2c1be60607d1c521af2c993f04d4f61 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 18 Sep 2025 12:30:32 -0300 Subject: [PATCH 434/450] smb: client: fix file open check in __cifs_unlink() Fix the file open check to decide whether or not silly-rename the file in SMB2+. Fixes: c5ea3065586d ("smb: client: fix data loss due to broken rename(2)") Signed-off-by: Paulo Alcantara (Red Hat) Cc: Frank Sorenson Reviewed-by: David Howells Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/inode.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 1703f1285d36..0f0d2dae6283 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2003,8 +2003,21 @@ static int __cifs_unlink(struct inode *dir, struct dentry *dentry, bool sillyren goto psx_del_no_retry; } - if (sillyrename || (server->vals->protocol_id > SMB10_PROT_ID && - d_is_positive(dentry) && d_count(dentry) > 2)) + /* For SMB2+, if the file is open, we always perform a silly rename. + * + * We check for d_count() right after calling + * cifs_close_deferred_file_under_dentry() to make sure that the + * dentry's refcount gets dropped in case the file had any deferred + * close. + */ + if (!sillyrename && server->vals->protocol_id > SMB10_PROT_ID) { + spin_lock(&dentry->d_lock); + if (d_count(dentry) > 1) + sillyrename = true; + spin_unlock(&dentry->d_lock); + } + + if (sillyrename) rc = -EBUSY; else rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry); From daac51c7032036a0ca5f1aa419ad1b0471d1c6e0 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Thu, 18 Sep 2025 03:06:46 +0200 Subject: [PATCH 435/450] smb: client: fix smbdirect_recv_io leak in smbd_negotiate() error path During tests of another unrelated patch I was able to trigger this error: Objects remaining on __kmem_cache_shutdown() Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Fixes: f198186aa9bb ("CIFS: SMBD: Establish SMB Direct connection") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 1f3d64145863..e0fce5033004 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1108,8 +1108,10 @@ static int smbd_negotiate(struct smbd_connection *info) log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", rc, response->sge.addr, response->sge.length, response->sge.lkey); - if (rc) + if (rc) { + put_receive_buffer(info, response); return rc; + } init_completion(&info->negotiate_completion); info->negotiate_done = false; From 1e56310b40fd2e7e0b9493da9ff488af145bdd0c Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Sat, 13 Sep 2025 06:26:57 +0000 Subject: [PATCH 436/450] iommu/amd/pgtbl: Fix possible race while increase page table level The AMD IOMMU host page table implementation supports dynamic page table levels (up to 6 levels), starting with a 3-level configuration that expands based on IOVA address. The kernel maintains a root pointer and current page table level to enable proper page table walks in alloc_pte()/fetch_pte() operations. The IOMMU IOVA allocator initially starts with 32-bit address and onces its exhuasted it switches to 64-bit address (max address is determined based on IOMMU and device DMA capability). To support larger IOVA, AMD IOMMU driver increases page table level. But in unmap path (iommu_v1_unmap_pages()), fetch_pte() reads pgtable->[root/mode] without lock. So its possible that in exteme corner case, when increase_address_space() is updating pgtable->[root/mode], fetch_pte() reads wrong page table level (pgtable->mode). It does compare the value with level encoded in page table and returns NULL. This will result is iommu_unmap ops to fail and upper layer may retry/log WARN_ON. CPU 0 CPU 1 ------ ------ map pages unmap pages alloc_pte() -> increase_address_space() iommu_v1_unmap_pages() -> fetch_pte() pgtable->root = pte (new root value) READ pgtable->[mode/root] Reads new root, old mode Updates mode (pgtable->mode += 1) Since Page table level updates are infrequent and already synchronized with a spinlock, implement seqcount to enable lock-free read operations on the read path. Fixes: 754265bcab7 ("iommu/amd: Fix race in increase_address_space()") Reported-by: Alejandro Jimenez Cc: stable@vger.kernel.org Cc: Joao Martins Cc: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 1 + drivers/iommu/amd/io_pgtable.c | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 5219d7ddfdaa..95f63c5f6159 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -555,6 +555,7 @@ struct gcr3_tbl_info { }; struct amd_io_pgtable { + seqcount_t seqcount; /* Protects root/mode update */ struct io_pgtable pgtbl; int mode; u64 *root; diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index a91e71f981ef..70c2f5b1631b 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -130,8 +131,11 @@ static bool increase_address_space(struct amd_io_pgtable *pgtable, *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root)); + write_seqcount_begin(&pgtable->seqcount); pgtable->root = pte; pgtable->mode += 1; + write_seqcount_end(&pgtable->seqcount); + amd_iommu_update_and_flush_device_table(domain); pte = NULL; @@ -153,6 +157,7 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable, { unsigned long last_addr = address + (page_size - 1); struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; + unsigned int seqcount; int level, end_lvl; u64 *pte, *page; @@ -170,8 +175,14 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable, } - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + do { + seqcount = read_seqcount_begin(&pgtable->seqcount); + + level = pgtable->mode - 1; + pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); + + address = PAGE_SIZE_ALIGN(address, page_size); end_lvl = PAGE_SIZE_LEVEL(page_size); @@ -249,6 +260,7 @@ static u64 *fetch_pte(struct amd_io_pgtable *pgtable, unsigned long *page_size) { int level; + unsigned int seqcount; u64 *pte; *page_size = 0; @@ -256,8 +268,12 @@ static u64 *fetch_pte(struct amd_io_pgtable *pgtable, if (address > PM_LEVEL_SIZE(pgtable->mode)) return NULL; - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + do { + seqcount = read_seqcount_begin(&pgtable->seqcount); + level = pgtable->mode - 1; + pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); + *page_size = PTE_LEVEL_PAGE_SIZE(level); while (level > 0) { @@ -541,6 +557,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo if (!pgtable->root) return NULL; pgtable->mode = PAGE_MODE_3_LEVEL; + seqcount_init(&pgtable->seqcount); cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; From 2c139a47eff8de24e3350dadb4c9d5e3426db826 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Fri, 19 Sep 2025 17:03:52 +0800 Subject: [PATCH 437/450] io_uring: fix incorrect io_kiocb reference in io_link_skb In io_link_skb function, there is a bug where prev_notif is incorrectly assigned using 'nd' instead of 'prev_nd'. This causes the context validation check to compare the current notification with itself instead of comparing it with the previous notification. Fix by using the correct prev_nd parameter when obtaining prev_notif. Signed-off-by: Yang Xiuwei Reviewed-by: Pavel Begunkov Fixes: 6fe4220912d19 ("io_uring/notif: implement notification stacking") Signed-off-by: Jens Axboe --- io_uring/notif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/notif.c b/io_uring/notif.c index 9a6f6e92d742..ea9c0116cec2 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -85,7 +85,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) return -EEXIST; prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); - prev_notif = cmd_to_io_kiocb(nd); + prev_notif = cmd_to_io_kiocb(prev_nd); /* make sure all noifications can be finished in the same task_work */ if (unlikely(notif->ctx != prev_notif->ctx || From 853a57ba263adfecf4430b936d6862bc475b4bb5 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sat, 20 Sep 2025 11:51:48 +0900 Subject: [PATCH 438/450] firewire: core: fix overlooked update of subsystem ABI version In kernel v6.5, several functions were added to the cdev layer. This required updating the default version of subsystem ABI up to 6, but this requirement was overlooked. This commit updates the version accordingly. Fixes: 6add87e9764d ("firewire: cdev: add new version of ABI to notify time stamp at request/response subaction of transaction#") Link: https://lore.kernel.org/r/20250920025148.163402-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-cdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 78b10c6ef7fe..2e93189d7142 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -41,7 +41,7 @@ /* * ABI version history is documented in linux/firewire-cdev.h. */ -#define FW_CDEV_KERNEL_VERSION 5 +#define FW_CDEV_KERNEL_VERSION 6 #define FW_CDEV_VERSION_EVENT_REQUEST2 4 #define FW_CDEV_VERSION_ALLOCATE_REGION_END 4 #define FW_CDEV_VERSION_AUTO_FLUSH_ISO_OVERFLOW 5 From 07e27ad16399afcd693be20211b0dfae63e0615f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 21 Sep 2025 15:08:52 -0700 Subject: [PATCH 439/450] Linux 6.17-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9771619ac596..10355ecf32cb 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Baby Opossum Posse # *DOCUMENTATION* From 7109f51bcc80b79a8f31e50fa5c96a20c54197d9 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 23 Sep 2025 23:37:08 +0800 Subject: [PATCH 440/450] LoongArch: KVM: Add PTW feature detection on new hardware With new Loongson-3A6000/3C6000 hardware platforms (or later), hardware page table walking (PTW) feature is supported on host. So here add this feature detection on KVM host. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/uapi/asm/kvm.h | 1 + arch/loongarch/kvm/vcpu.c | 2 ++ arch/loongarch/kvm/vm.c | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h index 5f354f5c6847..57ba1a563bb1 100644 --- a/arch/loongarch/include/uapi/asm/kvm.h +++ b/arch/loongarch/include/uapi/asm/kvm.h @@ -103,6 +103,7 @@ struct kvm_fpu { #define KVM_LOONGARCH_VM_FEAT_PMU 5 #define KVM_LOONGARCH_VM_FEAT_PV_IPI 6 #define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME 7 +#define KVM_LOONGARCH_VM_FEAT_PTW 8 /* Device Control API on vcpu fd */ #define KVM_LOONGARCH_VCPU_CPUCFG 0 diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index ce478151466c..9c802f7103c6 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -680,6 +680,8 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v) *v |= CPUCFG2_ARMBT; if (cpu_has_lbt_mips) *v |= CPUCFG2_MIPSBT; + if (cpu_has_ptw) + *v |= CPUCFG2_PTW; return 0; case LOONGARCH_CPUCFG3: diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index edccfc8c9cd8..a49b1c1a3dd1 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -146,6 +146,10 @@ static int kvm_vm_feature_has_attr(struct kvm *kvm, struct kvm_device_attr *attr if (kvm_pvtime_supported()) return 0; return -ENXIO; + case KVM_LOONGARCH_VM_FEAT_PTW: + if (cpu_has_ptw) + return 0; + return -ENXIO; default: return -ENXIO; } From 80edf90831a2c129db478b1e72b27839602b72ea Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 23 Sep 2025 23:37:08 +0800 Subject: [PATCH 441/450] LoongArch: KVM: Add sign extension with kernel MMIO read emulation Function kvm_complete_mmio_read() is to add sign extension with MMIO read emulation, it is used in user space MMIO read completion now. Also it should be used in kernel MMIO read emulation. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 6c9c7de7226b..45833fbe16bf 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -468,6 +468,8 @@ int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst) if (ret == EMULATE_DO_MMIO) { trace_kvm_mmio(KVM_TRACE_MMIO_READ, run->mmio.len, run->mmio.phys_addr, NULL); + vcpu->arch.io_gpr = rd; /* Set for kvm_complete_mmio_read() use */ + /* * If mmio device such as PCH-PIC is emulated in KVM, * it need not return to user space to handle the mmio @@ -475,16 +477,15 @@ int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst) */ idx = srcu_read_lock(&vcpu->kvm->srcu); ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, vcpu->arch.badv, - run->mmio.len, &vcpu->arch.gprs[rd]); + run->mmio.len, run->mmio.data); srcu_read_unlock(&vcpu->kvm->srcu, idx); if (!ret) { + kvm_complete_mmio_read(vcpu, run); update_pc(&vcpu->arch); vcpu->mmio_needed = 0; return EMULATE_DONE; } - /* Set for kvm_complete_mmio_read() use */ - vcpu->arch.io_gpr = rd; run->mmio.is_write = 0; vcpu->mmio_is_write = 0; return EMULATE_DO_MMIO; From 44598fe77663da1fdd59c2b01fc317afcb7bc753 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 23 Sep 2025 23:37:08 +0800 Subject: [PATCH 442/450] LoongArch: KVM: Add sign extension with kernel IOCSR read emulation Function kvm_complete_iocsr_read() is to add sign extension with IOCSR read emulation, it is used in user space IOCSR read completion now. Also it should be used in kernel IOCSR read emulation. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 45833fbe16bf..cb493980d874 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -218,16 +218,16 @@ int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) } trace_kvm_iocsr(KVM_TRACE_IOCSR_WRITE, run->iocsr_io.len, addr, val); } else { + vcpu->arch.io_gpr = rd; /* Set register id for iocsr read completion */ idx = srcu_read_lock(&vcpu->kvm->srcu); - ret = kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, run->iocsr_io.len, val); + ret = kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, + run->iocsr_io.len, run->iocsr_io.data); srcu_read_unlock(&vcpu->kvm->srcu, idx); - if (ret == 0) + if (ret == 0) { + kvm_complete_iocsr_read(vcpu, run); ret = EMULATE_DONE; - else { + } else ret = EMULATE_DO_IOCSR; - /* Save register id for iocsr read completion */ - vcpu->arch.io_gpr = rd; - } trace_kvm_iocsr(KVM_TRACE_IOCSR_READ, run->iocsr_io.len, addr, NULL); } From 1cf7b2881d832f05b31ec06484774604676189e2 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 23 Sep 2025 23:37:08 +0800 Subject: [PATCH 443/450] LoongArch: KVM: Add implementation with IOCSR_IPI_SET IPI IOCSR register IOCSR_IPI_SET can send ipi interrupt to other vCPUs, but it can also send an interrupt to vCPU itself. Indeed there are such operations on Linux as arch_irq_work_raise() which will send ipi message to vCPU itself. Here add implementation of write operation with IOCSR_IPI_SET register. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/ipi.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c index 5a8481dda052..e339b6c7b0c3 100644 --- a/arch/loongarch/kvm/intc/ipi.c +++ b/arch/loongarch/kvm/intc/ipi.c @@ -7,12 +7,25 @@ #include #include +static void ipi_set(struct kvm_vcpu *vcpu, uint32_t data) +{ + uint32_t status; + struct kvm_interrupt irq; + + spin_lock(&vcpu->arch.ipi_state.lock); + status = vcpu->arch.ipi_state.status; + vcpu->arch.ipi_state.status |= data; + spin_unlock(&vcpu->arch.ipi_state.lock); + if ((status == 0) && data) { + irq.irq = LARCH_INT_IPI; + kvm_vcpu_ioctl_interrupt(vcpu, &irq); + } +} + static void ipi_send(struct kvm *kvm, uint64_t data) { - int cpu, action; - uint32_t status; + int cpu; struct kvm_vcpu *vcpu; - struct kvm_interrupt irq; cpu = ((data & 0xffffffff) >> 16) & 0x3ff; vcpu = kvm_get_vcpu_by_cpuid(kvm, cpu); @@ -21,15 +34,7 @@ static void ipi_send(struct kvm *kvm, uint64_t data) return; } - action = BIT(data & 0x1f); - spin_lock(&vcpu->arch.ipi_state.lock); - status = vcpu->arch.ipi_state.status; - vcpu->arch.ipi_state.status |= action; - spin_unlock(&vcpu->arch.ipi_state.lock); - if (status == 0) { - irq.irq = LARCH_INT_IPI; - kvm_vcpu_ioctl_interrupt(vcpu, &irq); - } + ipi_set(vcpu, BIT(data & 0x1f)); } static void ipi_clear(struct kvm_vcpu *vcpu, uint64_t data) @@ -231,7 +236,7 @@ static int loongarch_ipi_writel(struct kvm_vcpu *vcpu, gpa_t addr, int len, cons spin_unlock(&vcpu->arch.ipi_state.lock); break; case IOCSR_IPI_SET: - ret = -EINVAL; + ipi_set(vcpu, data); break; case IOCSR_IPI_CLEAR: /* Just clear the status of the current vcpu */ @@ -250,10 +255,10 @@ static int loongarch_ipi_writel(struct kvm_vcpu *vcpu, gpa_t addr, int len, cons ipi_send(vcpu->kvm, data); break; case IOCSR_MAIL_SEND: - ret = mail_send(vcpu->kvm, *(uint64_t *)val); + ret = mail_send(vcpu->kvm, data); break; case IOCSR_ANY_SEND: - ret = any_send(vcpu->kvm, *(uint64_t *)val); + ret = any_send(vcpu->kvm, data); break; default: kvm_err("%s: unknown addr: %llx\n", __func__, addr); From 3da2b0d439aa4eccc2a65e87f0bb280595d91ffe Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 23 Sep 2025 23:37:09 +0800 Subject: [PATCH 444/450] LoongArch: KVM: Access mailbox directly in mail_send() With function mail_send(), it is to write mailbox of other VCPUs. Existing simple APIs read_mailbox()/write_mailbox() can be used directly rather than send command on IOCSR address. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/ipi.c | 45 ++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c index e339b6c7b0c3..05cefd29282e 100644 --- a/arch/loongarch/kvm/intc/ipi.c +++ b/arch/loongarch/kvm/intc/ipi.c @@ -101,6 +101,34 @@ static void write_mailbox(struct kvm_vcpu *vcpu, int offset, uint64_t data, int spin_unlock(&vcpu->arch.ipi_state.lock); } +static int mail_send(struct kvm *kvm, uint64_t data) +{ + int i, cpu, mailbox, offset; + uint32_t val = 0, mask = 0; + struct kvm_vcpu *vcpu; + + cpu = ((data & 0xffffffff) >> 16) & 0x3ff; + vcpu = kvm_get_vcpu_by_cpuid(kvm, cpu); + if (unlikely(vcpu == NULL)) { + kvm_err("%s: invalid target cpu: %d\n", __func__, cpu); + return -EINVAL; + } + mailbox = ((data & 0xffffffff) >> 2) & 0x7; + offset = IOCSR_IPI_BUF_20 + mailbox * 4; + if ((data >> 27) & 0xf) { + val = read_mailbox(vcpu, offset, 4); + for (i = 0; i < 4; i++) + if (data & (BIT(27 + i))) + mask |= (0xff << (i * 8)); + val &= mask; + } + + val |= ((uint32_t)(data >> 32) & ~mask); + write_mailbox(vcpu, offset, val, 4); + + return 0; +} + static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) { int i, idx, ret; @@ -137,23 +165,6 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) return ret; } -static int mail_send(struct kvm *kvm, uint64_t data) -{ - int cpu, mailbox, offset; - struct kvm_vcpu *vcpu; - - cpu = ((data & 0xffffffff) >> 16) & 0x3ff; - vcpu = kvm_get_vcpu_by_cpuid(kvm, cpu); - if (unlikely(vcpu == NULL)) { - kvm_err("%s: invalid target cpu: %d\n", __func__, cpu); - return -EINVAL; - } - mailbox = ((data & 0xffffffff) >> 2) & 0x7; - offset = IOCSR_IPI_BASE + IOCSR_IPI_BUF_20 + mailbox * 4; - - return send_ipi_data(vcpu, offset, data); -} - static int any_send(struct kvm *kvm, uint64_t data) { int cpu, offset; From 2f412eb7650c369744420cc2d06404fe3f1fb79f Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 23 Sep 2025 23:37:09 +0800 Subject: [PATCH 445/450] LoongArch: KVM: Set version information at initial stage Register PCH_PIC_INT_ID constains version and supported irq number information, and it is a read only register. The detailed value can be set at initial stage, rather than read callback. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kvm_pch_pic.h | 15 ++++++++++++- arch/loongarch/kvm/intc/pch_pic.c | 27 ++++++++++++++++-------- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/arch/loongarch/include/asm/kvm_pch_pic.h b/arch/loongarch/include/asm/kvm_pch_pic.h index e6df6a4c1c70..7f33a3039272 100644 --- a/arch/loongarch/include/asm/kvm_pch_pic.h +++ b/arch/loongarch/include/asm/kvm_pch_pic.h @@ -34,13 +34,26 @@ #define PCH_PIC_INT_ISR_END 0x3af #define PCH_PIC_POLARITY_START 0x3e0 #define PCH_PIC_POLARITY_END 0x3e7 -#define PCH_PIC_INT_ID_VAL 0x7000000UL +#define PCH_PIC_INT_ID_VAL 0x7UL #define PCH_PIC_INT_ID_VER 0x1UL +union pch_pic_id { + struct { + uint8_t reserved_0[3]; + uint8_t id; + uint8_t version; + uint8_t reserved_1; + uint8_t irq_num; + uint8_t reserved_2; + } desc; + uint64_t data; +}; + struct loongarch_pch_pic { spinlock_t lock; struct kvm *kvm; struct kvm_io_device device; + union pch_pic_id id; uint64_t mask; /* 1:disable irq, 0:enable irq */ uint64_t htmsi_en; /* 1:msi */ uint64_t edge; /* 1:edge triggered, 0:level triggered */ diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c index baf3b4faf7ea..f41ef8c42ece 100644 --- a/arch/loongarch/kvm/intc/pch_pic.c +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -120,20 +120,13 @@ static int loongarch_pch_pic_read(struct loongarch_pch_pic *s, gpa_t addr, int l { int offset, index, ret = 0; u32 data = 0; - u64 int_id = 0; offset = addr - s->pch_pic_base; spin_lock(&s->lock); switch (offset) { case PCH_PIC_INT_ID_START ... PCH_PIC_INT_ID_END: - /* int id version */ - int_id |= (u64)PCH_PIC_INT_ID_VER << 32; - /* irq number */ - int_id |= (u64)31 << (32 + 16); - /* int id value */ - int_id |= PCH_PIC_INT_ID_VAL; - *(u64 *)val = int_id; + *(u64 *)val = s->id.data; break; case PCH_PIC_MASK_START ... PCH_PIC_MASK_END: offset -= PCH_PIC_MASK_START; @@ -484,7 +477,7 @@ static int kvm_setup_default_irq_routing(struct kvm *kvm) static int kvm_pch_pic_create(struct kvm_device *dev, u32 type) { - int ret; + int i, ret, irq_num; struct kvm *kvm = dev->kvm; struct loongarch_pch_pic *s; @@ -500,6 +493,22 @@ static int kvm_pch_pic_create(struct kvm_device *dev, u32 type) if (!s) return -ENOMEM; + /* + * Interrupt controller identification register 1 + * Bit 24-31 Interrupt Controller ID + * Interrupt controller identification register 2 + * Bit 0-7 Interrupt Controller version number + * Bit 16-23 The number of interrupt sources supported + */ + irq_num = 32; + s->mask = -1UL; + s->id.desc.id = PCH_PIC_INT_ID_VAL; + s->id.desc.version = PCH_PIC_INT_ID_VER; + s->id.desc.irq_num = irq_num - 1; + for (i = 0; i < irq_num; i++) { + s->route_entry[i] = 1; + s->htmsi_vector[i] = i; + } spin_lock_init(&s->lock); s->kvm = kvm; kvm->arch.pch_pic = s; From eb626c7704dc9dbc2eee2c2bee281992d36de6da Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 23 Sep 2025 23:37:09 +0800 Subject: [PATCH 446/450] LoongArch: KVM: Add IRR and ISR register read emulation With LS7A user manual, there are registers PCH_PIC_INT_IRR_START and PCH_PIC_INT_ISR_START. So add read access emulation in function loongarch_pch_pic_read() here. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/pch_pic.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c index f41ef8c42ece..c19cf9d7b08a 100644 --- a/arch/loongarch/kvm/intc/pch_pic.c +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -168,6 +168,12 @@ static int loongarch_pch_pic_read(struct loongarch_pch_pic *s, gpa_t addr, int l /* we only use defalut value 0: high level triggered */ *(u32 *)val = 0; break; + case PCH_PIC_INT_IRR_START: + data = s->irr; + break; + case PCH_PIC_INT_ISR_START: + data = s->isr; + break; default: ret = -EINVAL; } From f8a73df503f5145317de992b94df603ed97e2f86 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 23 Sep 2025 23:37:09 +0800 Subject: [PATCH 447/450] LoongArch: KVM: Add different length support in loongarch_pch_pic_read() With function loongarch_pch_pic_read(), currently it is hardcoded length for different registers, and the length comes from exising linux pch_pic driver code. But in theory, all length 1/2/4/8 should be supported for all the registers, here add different length support about register read emulation in function loongarch_pch_pic_read(). Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/pch_pic.c | 42 ++++++++++++++----------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c index c19cf9d7b08a..be63901ab7ab 100644 --- a/arch/loongarch/kvm/intc/pch_pic.c +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -118,55 +118,45 @@ static u32 pch_pic_write_reg(u64 *s, int high, u32 v) static int loongarch_pch_pic_read(struct loongarch_pch_pic *s, gpa_t addr, int len, void *val) { - int offset, index, ret = 0; - u32 data = 0; + int ret = 0, offset; + u64 data = 0; + void *ptemp; offset = addr - s->pch_pic_base; + offset -= offset & 7; spin_lock(&s->lock); switch (offset) { case PCH_PIC_INT_ID_START ... PCH_PIC_INT_ID_END: - *(u64 *)val = s->id.data; + data = s->id.data; break; case PCH_PIC_MASK_START ... PCH_PIC_MASK_END: - offset -= PCH_PIC_MASK_START; - index = offset >> 2; - /* read mask reg */ - data = pch_pic_read_reg(&s->mask, index); - *(u32 *)val = data; + data = s->mask; break; case PCH_PIC_HTMSI_EN_START ... PCH_PIC_HTMSI_EN_END: - offset -= PCH_PIC_HTMSI_EN_START; - index = offset >> 2; /* read htmsi enable reg */ - data = pch_pic_read_reg(&s->htmsi_en, index); - *(u32 *)val = data; + data = s->htmsi_en; break; case PCH_PIC_EDGE_START ... PCH_PIC_EDGE_END: - offset -= PCH_PIC_EDGE_START; - index = offset >> 2; /* read edge enable reg */ - data = pch_pic_read_reg(&s->edge, index); - *(u32 *)val = data; + data = s->edge; break; case PCH_PIC_AUTO_CTRL0_START ... PCH_PIC_AUTO_CTRL0_END: case PCH_PIC_AUTO_CTRL1_START ... PCH_PIC_AUTO_CTRL1_END: /* we only use default mode: fixed interrupt distribution mode */ - *(u32 *)val = 0; break; case PCH_PIC_ROUTE_ENTRY_START ... PCH_PIC_ROUTE_ENTRY_END: /* only route to int0: eiointc */ - *(u8 *)val = 1; + ptemp = s->route_entry + (offset - PCH_PIC_ROUTE_ENTRY_START); + data = *(u64 *)ptemp; break; case PCH_PIC_HTMSI_VEC_START ... PCH_PIC_HTMSI_VEC_END: - offset -= PCH_PIC_HTMSI_VEC_START; /* read htmsi vector */ - data = s->htmsi_vector[offset]; - *(u8 *)val = data; + ptemp = s->htmsi_vector + (offset - PCH_PIC_HTMSI_VEC_START); + data = *(u64 *)ptemp; break; case PCH_PIC_POLARITY_START ... PCH_PIC_POLARITY_END: - /* we only use defalut value 0: high level triggered */ - *(u32 *)val = 0; + data = s->polarity; break; case PCH_PIC_INT_IRR_START: data = s->irr; @@ -179,6 +169,12 @@ static int loongarch_pch_pic_read(struct loongarch_pch_pic *s, gpa_t addr, int l } spin_unlock(&s->lock); + if (ret == 0) { + offset = (addr - s->pch_pic_base) & 7; + data = data >> (offset * 8); + memcpy(val, &data, len); + } + return ret; } From f851fdd895dea69e23acc80262da0b8f9e4d04ad Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 23 Sep 2025 23:37:26 +0800 Subject: [PATCH 448/450] LoongArch: KVM: Add different length support in loongarch_pch_pic_write() With function loongarch_pch_pic_write(), currently there is only four bytes register write support. But in theory, all length 1/2/4/8 should be supported for all the registers, here add different length support about register write emulation in function loongarch_pch_pic_write(). Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/pch_pic.c | 155 ++++++++++-------------------- 1 file changed, 52 insertions(+), 103 deletions(-) diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c index be63901ab7ab..f464f1f5a67a 100644 --- a/arch/loongarch/kvm/intc/pch_pic.c +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -77,45 +77,6 @@ void pch_msi_set_irq(struct kvm *kvm, int irq, int level) eiointc_set_irq(kvm->arch.eiointc, irq, level); } -/* - * pch pic register is 64-bit, but it is accessed by 32-bit, - * so we use high to get whether low or high 32 bits we want - * to read. - */ -static u32 pch_pic_read_reg(u64 *s, int high) -{ - u64 val = *s; - - /* read the high 32 bits when high is 1 */ - return high ? (u32)(val >> 32) : (u32)val; -} - -/* - * pch pic register is 64-bit, but it is accessed by 32-bit, - * so we use high to get whether low or high 32 bits we want - * to write. - */ -static u32 pch_pic_write_reg(u64 *s, int high, u32 v) -{ - u64 val = *s, data = v; - - if (high) { - /* - * Clear val high 32 bits - * Write the high 32 bits when the high is 1 - */ - *s = (val << 32 >> 32) | (data << 32); - val >>= 32; - } else - /* - * Clear val low 32 bits - * Write the low 32 bits when the high is 0 - */ - *s = (val >> 32 << 32) | v; - - return (u32)val; -} - static int loongarch_pch_pic_read(struct loongarch_pch_pic *s, gpa_t addr, int len, void *val) { int ret = 0, offset; @@ -205,81 +166,69 @@ static int kvm_pch_pic_read(struct kvm_vcpu *vcpu, static int loongarch_pch_pic_write(struct loongarch_pch_pic *s, gpa_t addr, int len, const void *val) { - int ret; - u32 old, data, offset, index; - u64 irq; + int ret = 0, offset; + u64 old, data, mask; + void *ptemp; - ret = 0; - data = *(u32 *)val; - offset = addr - s->pch_pic_base; + switch (len) { + case 1: + data = *(u8 *)val; + mask = 0xFF; + break; + case 2: + data = *(u16 *)val; + mask = USHRT_MAX; + break; + case 4: + data = *(u32 *)val; + mask = UINT_MAX; + break; + case 8: + default: + data = *(u64 *)val; + mask = ULONG_MAX; + break; + } + + offset = (addr - s->pch_pic_base) & 7; + mask = mask << (offset * 8); + data = data << (offset * 8); + offset = (addr - s->pch_pic_base) - offset; spin_lock(&s->lock); switch (offset) { - case PCH_PIC_MASK_START ... PCH_PIC_MASK_END: - offset -= PCH_PIC_MASK_START; - /* get whether high or low 32 bits we want to write */ - index = offset >> 2; - old = pch_pic_write_reg(&s->mask, index, data); - /* enable irq when mask value change to 0 */ - irq = (old & ~data) << (32 * index); - pch_pic_update_batch_irqs(s, irq, 1); - /* disable irq when mask value change to 1 */ - irq = (~old & data) << (32 * index); - pch_pic_update_batch_irqs(s, irq, 0); + case PCH_PIC_MASK_START: + old = s->mask; + s->mask = (old & ~mask) | data; + if (old & ~data) + pch_pic_update_batch_irqs(s, old & ~data, 1); + if (~old & data) + pch_pic_update_batch_irqs(s, ~old & data, 0); break; - case PCH_PIC_HTMSI_EN_START ... PCH_PIC_HTMSI_EN_END: - offset -= PCH_PIC_HTMSI_EN_START; - index = offset >> 2; - pch_pic_write_reg(&s->htmsi_en, index, data); + case PCH_PIC_HTMSI_EN_START: + s->htmsi_en = (s->htmsi_en & ~mask) | data; break; - case PCH_PIC_EDGE_START ... PCH_PIC_EDGE_END: - offset -= PCH_PIC_EDGE_START; - index = offset >> 2; - /* 1: edge triggered, 0: level triggered */ - pch_pic_write_reg(&s->edge, index, data); + case PCH_PIC_EDGE_START: + s->edge = (s->edge & ~mask) | data; break; - case PCH_PIC_CLEAR_START ... PCH_PIC_CLEAR_END: - offset -= PCH_PIC_CLEAR_START; - index = offset >> 2; - /* write 1 to clear edge irq */ - old = pch_pic_read_reg(&s->irr, index); - /* - * get the irq bitmap which is edge triggered and - * already set and to be cleared - */ - irq = old & pch_pic_read_reg(&s->edge, index) & data; - /* write irr to the new state where irqs have been cleared */ - pch_pic_write_reg(&s->irr, index, old & ~irq); - /* update cleared irqs */ - pch_pic_update_batch_irqs(s, irq, 0); + case PCH_PIC_POLARITY_START: + s->polarity = (s->polarity & ~mask) | data; break; - case PCH_PIC_AUTO_CTRL0_START ... PCH_PIC_AUTO_CTRL0_END: - offset -= PCH_PIC_AUTO_CTRL0_START; - index = offset >> 2; - /* we only use default mode: fixed interrupt distribution mode */ - pch_pic_write_reg(&s->auto_ctrl0, index, 0); - break; - case PCH_PIC_AUTO_CTRL1_START ... PCH_PIC_AUTO_CTRL1_END: - offset -= PCH_PIC_AUTO_CTRL1_START; - index = offset >> 2; - /* we only use default mode: fixed interrupt distribution mode */ - pch_pic_write_reg(&s->auto_ctrl1, index, 0); - break; - case PCH_PIC_ROUTE_ENTRY_START ... PCH_PIC_ROUTE_ENTRY_END: - offset -= PCH_PIC_ROUTE_ENTRY_START; - /* only route to int0: eiointc */ - s->route_entry[offset] = 1; + case PCH_PIC_CLEAR_START: + old = s->irr & s->edge & data; + if (old) { + s->irr &= ~old; + pch_pic_update_batch_irqs(s, old, 0); + } break; case PCH_PIC_HTMSI_VEC_START ... PCH_PIC_HTMSI_VEC_END: - /* route table to eiointc */ - offset -= PCH_PIC_HTMSI_VEC_START; - s->htmsi_vector[offset] = (u8)data; + ptemp = s->htmsi_vector + (offset - PCH_PIC_HTMSI_VEC_START); + *(u64 *)ptemp = (*(u64 *)ptemp & ~mask) | data; break; - case PCH_PIC_POLARITY_START ... PCH_PIC_POLARITY_END: - offset -= PCH_PIC_POLARITY_START; - index = offset >> 2; - /* we only use defalut value 0: high level triggered */ - pch_pic_write_reg(&s->polarity, index, 0); + /* Not implemented */ + case PCH_PIC_AUTO_CTRL0_START: + case PCH_PIC_AUTO_CTRL1_START: + case PCH_PIC_ROUTE_ENTRY_START ... PCH_PIC_ROUTE_ENTRY_END: break; default: ret = -EINVAL; From 77336b918f59b9c715d1f752149557a05e0bc0bc Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Tue, 23 Sep 2025 23:37:26 +0800 Subject: [PATCH 449/450] LoongArch: KVM: Rework pch_pic_update_batch_irqs() Use proper bitmap API and drop all the housekeeping code. Reviewed-by: Bibo Mao Signed-off-by: Yury Norov (NVIDIA) Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/pch_pic.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c index f464f1f5a67a..a698a73de399 100644 --- a/arch/loongarch/kvm/intc/pch_pic.c +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -35,16 +35,11 @@ static void pch_pic_update_irq(struct loongarch_pch_pic *s, int irq, int level) /* update batch irqs, the irq_mask is a bitmap of irqs */ static void pch_pic_update_batch_irqs(struct loongarch_pch_pic *s, u64 irq_mask, int level) { - int irq, bits; + unsigned int irq; + DECLARE_BITMAP(irqs, 64) = { BITMAP_FROM_U64(irq_mask) }; - /* find each irq by irqs bitmap and update each irq */ - bits = sizeof(irq_mask) * 8; - irq = find_first_bit((void *)&irq_mask, bits); - while (irq < bits) { + for_each_set_bit(irq, irqs, 64) pch_pic_update_irq(s, irq, level); - bitmap_clear((void *)&irq_mask, irq, 1); - irq = find_first_bit((void *)&irq_mask, bits); - } } /* called when a irq is triggered in pch pic */ From 66e2d96b1c5875122bfb94239989d832ccf51477 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Sep 2025 23:37:26 +0800 Subject: [PATCH 450/450] LoongArch: KVM: Move kvm_iocsr tracepoint out of generic code The tracepoint kvm_iocsr is only used by the loongarch architecture. As trace events can take up to 5K of memory, move this tracepoint into the LoongArch specific tracing file so that it doesn't waste memory for all other architectures. Reviewed-by: Bibo Mao Signed-off-by: Steven Rostedt (Google) Signed-off-by: Huacai Chen --- arch/loongarch/kvm/trace.h | 35 +++++++++++++++++++++++++++++++++++ include/trace/events/kvm.h | 35 ----------------------------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/arch/loongarch/kvm/trace.h b/arch/loongarch/kvm/trace.h index 145514dab6d5..3467ee22b704 100644 --- a/arch/loongarch/kvm/trace.h +++ b/arch/loongarch/kvm/trace.h @@ -161,6 +161,41 @@ TRACE_EVENT(kvm_aux, __entry->pc) ); +#define KVM_TRACE_IOCSR_READ_UNSATISFIED 0 +#define KVM_TRACE_IOCSR_READ 1 +#define KVM_TRACE_IOCSR_WRITE 2 + +#define kvm_trace_symbol_iocsr \ + { KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \ + { KVM_TRACE_IOCSR_READ, "read" }, \ + { KVM_TRACE_IOCSR_WRITE, "write" } + +TRACE_EVENT(kvm_iocsr, + TP_PROTO(int type, int len, u64 gpa, void *val), + TP_ARGS(type, len, gpa, val), + + TP_STRUCT__entry( + __field( u32, type ) + __field( u32, len ) + __field( u64, gpa ) + __field( u64, val ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->len = len; + __entry->gpa = gpa; + __entry->val = 0; + if (val) + memcpy(&__entry->val, val, + min_t(u32, sizeof(__entry->val), len)); + ), + + TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx", + __print_symbolic(__entry->type, kvm_trace_symbol_iocsr), + __entry->len, __entry->gpa, __entry->val) +); + TRACE_EVENT(kvm_vpid_change, TP_PROTO(struct kvm_vcpu *vcpu, unsigned long vpid), TP_ARGS(vcpu, vpid), diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 8b7252b8d751..b282e3a86769 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -156,41 +156,6 @@ TRACE_EVENT(kvm_mmio, __entry->len, __entry->gpa, __entry->val) ); -#define KVM_TRACE_IOCSR_READ_UNSATISFIED 0 -#define KVM_TRACE_IOCSR_READ 1 -#define KVM_TRACE_IOCSR_WRITE 2 - -#define kvm_trace_symbol_iocsr \ - { KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \ - { KVM_TRACE_IOCSR_READ, "read" }, \ - { KVM_TRACE_IOCSR_WRITE, "write" } - -TRACE_EVENT(kvm_iocsr, - TP_PROTO(int type, int len, u64 gpa, void *val), - TP_ARGS(type, len, gpa, val), - - TP_STRUCT__entry( - __field( u32, type ) - __field( u32, len ) - __field( u64, gpa ) - __field( u64, val ) - ), - - TP_fast_assign( - __entry->type = type; - __entry->len = len; - __entry->gpa = gpa; - __entry->val = 0; - if (val) - memcpy(&__entry->val, val, - min_t(u32, sizeof(__entry->val), len)); - ), - - TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx", - __print_symbolic(__entry->type, kvm_trace_symbol_iocsr), - __entry->len, __entry->gpa, __entry->val) -); - #define kvm_fpu_load_symbol \ {0, "unload"}, \ {1, "load"}